From c2361301ecf23c0823980772d8fe0033b5dd6a85 Mon Sep 17 00:00:00 2001
From: "Holger E. Jones" <jones19@llnl.gov>
Date: Wed, 15 Jun 2016 16:56:23 -0700
Subject: [PATCH 1/9] generic thread and block IDs

---
 include/RAJA/exec-cuda/MemUtils_CUDA.hxx |   2 +-
 include/RAJA/exec-cuda/reduce_cuda.hxx   | 274 +++++++++++++----------
 2 files changed, 156 insertions(+), 120 deletions(-)

diff --git a/include/RAJA/exec-cuda/MemUtils_CUDA.hxx b/include/RAJA/exec-cuda/MemUtils_CUDA.hxx
index d4a4b556a..7492fe70b 100644
--- a/include/RAJA/exec-cuda/MemUtils_CUDA.hxx
+++ b/include/RAJA/exec-cuda/MemUtils_CUDA.hxx
@@ -74,7 +74,7 @@ namespace RAJA {
 typedef double CudaReductionBlockDataType;
 
 typedef struct {
-  double val;
+  CudaReductionBlockDataType val;
   Index_type idx;
 } CudaReductionLocBlockDataType;
 
diff --git a/include/RAJA/exec-cuda/reduce_cuda.hxx b/include/RAJA/exec-cuda/reduce_cuda.hxx
index 7825fb6c2..6e938f215 100644
--- a/include/RAJA/exec-cuda/reduce_cuda.hxx
+++ b/include/RAJA/exec-cuda/reduce_cuda.hxx
@@ -362,47 +362,52 @@ class ReduceMin<cuda_reduce<BLOCK_SIZE>, T> {
   __device__ ReduceMin<cuda_reduce<BLOCK_SIZE>, T> min(T val) const {
     __shared__ T sd[BLOCK_SIZE];
 
+
+    int threadId = threadIdx.x + 
+               blockDim.x * threadIdx.y + 
+               (blockDim.x * blockDim.y) * threadIdx.z;
+
     // initialize shared memory
     for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
       // this descends all the way to 1
-      if (threadIdx.x < i) {
+      if (threadId < i) {
         // no need for __syncthreads()
-        sd[threadIdx.x + i] = m_reduced_val;
+        sd[threadId + i] = m_reduced_val;
       }
     }
     __syncthreads();
 
-    sd[threadIdx.x] = val;
+    sd[threadId] = val;
     __syncthreads();
 
     for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-      if (threadIdx.x < i) {
-        sd[threadIdx.x] = RAJA_MIN(sd[threadIdx.x], sd[threadIdx.x + i]);
+      if (threadId < i) {
+        sd[threadId] = RAJA_MIN(sd[threadId], sd[threadId + i]);
       }
       __syncthreads();
     }
 
-    if (threadIdx.x < 16) {
-      sd[threadIdx.x] = RAJA_MIN(sd[threadIdx.x], sd[threadIdx.x + 16]);
+    if (threadId < 16) {
+      sd[threadId] = RAJA_MIN(sd[threadId], sd[threadId + 16]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 8) {
-      sd[threadIdx.x] = RAJA_MIN(sd[threadIdx.x], sd[threadIdx.x + 8]);
+    if (threadId < 8) {
+      sd[threadId] = RAJA_MIN(sd[threadId], sd[threadId + 8]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 4) {
-      sd[threadIdx.x] = RAJA_MIN(sd[threadIdx.x], sd[threadIdx.x + 4]);
+    if (threadId < 4) {
+      sd[threadId] = RAJA_MIN(sd[threadId], sd[threadId + 4]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 2) {
-      sd[threadIdx.x] = RAJA_MIN(sd[threadIdx.x], sd[threadIdx.x + 2]);
+    if (threadId < 2) {
+      sd[threadId] = RAJA_MIN(sd[threadId], sd[threadId + 2]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 1) {
+    if (threadId < 1) {
       sd[0] = RAJA_MIN(sd[0], sd[1]);
       atomicMin(&(m_tallydata->tally), sd[0]);
     }
@@ -489,47 +494,51 @@ class ReduceMax<cuda_reduce<BLOCK_SIZE>, T> {
   __device__ ReduceMax<cuda_reduce<BLOCK_SIZE>, T> max(T val) const {
     __shared__ T sd[BLOCK_SIZE];
 
+    int threadId = threadIdx.x + 
+               blockDim.x * threadIdx.y + 
+               (blockDim.x * blockDim.y) * threadIdx.z;
+
     // initialize shared memory
     for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
       // this descends all the way to 1
-      if (threadIdx.x < i) {
+      if (threadId < i) {
         // no need for __syncthreads()
-        sd[threadIdx.x + i] = m_reduced_val;
+        sd[threadId + i] = m_reduced_val;
       }
     }
     __syncthreads();
 
-    sd[threadIdx.x] = val;
+    sd[threadId] = val;
     __syncthreads();
 
     for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-      if (threadIdx.x < i) {
-        sd[threadIdx.x] = RAJA_MAX(sd[threadIdx.x], sd[threadIdx.x + i]);
+      if (threadId < i) {
+        sd[threadId] = RAJA_MAX(sd[threadId], sd[threadId + i]);
       }
       __syncthreads();
     }
 
-    if (threadIdx.x < 16) {
-      sd[threadIdx.x] = RAJA_MAX(sd[threadIdx.x], sd[threadIdx.x + 16]);
+    if (threadId < 16) {
+      sd[threadId] = RAJA_MAX(sd[threadId], sd[threadId + 16]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 8) {
-      sd[threadIdx.x] = RAJA_MAX(sd[threadIdx.x], sd[threadIdx.x + 8]);
+    if (threadId < 8) {
+      sd[threadId] = RAJA_MAX(sd[threadId], sd[threadId + 8]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 4) {
-      sd[threadIdx.x] = RAJA_MAX(sd[threadIdx.x], sd[threadIdx.x + 4]);
+    if (threadId < 4) {
+      sd[threadId] = RAJA_MAX(sd[threadId], sd[threadId + 4]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 2) {
-      sd[threadIdx.x] = RAJA_MAX(sd[threadIdx.x], sd[threadIdx.x + 2]);
+    if (threadId < 2) {
+      sd[threadId] = RAJA_MAX(sd[threadId], sd[threadId + 2]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 1) {
+    if (threadId < 1) {
       sd[0] = RAJA_MAX(sd[0], sd[1]);
       atomicMax(&(m_tallydata->tally), sd[0]);
     }
@@ -641,34 +650,44 @@ class ReduceSum<cuda_reduce<BLOCK_SIZE>, T> {
   __device__ ReduceSum<cuda_reduce<BLOCK_SIZE>, T> operator+=(T val) const {
     __shared__ T sd[BLOCK_SIZE];
 
-    if (blockDim.x * blockIdx.x + threadIdx.x == 0) {
-      m_max_grid_size[0] = RAJA_MAX(gridDim.x, m_max_grid_size[0]);
+    int blockId = blockIdx.x 
+       + blockIdx.y * gridDim.x 
+       + gridDim.x * gridDim.y * blockIdx.z; 
+
+
+    int threadId = threadIdx.x + 
+               blockDim.x * threadIdx.y + 
+               (blockDim.x * blockDim.y) * threadIdx.z;
+
+
+    if (blockId  + threadId == 0) {
+      m_max_grid_size[0] = RAJA_MAX(gridDim.x*gridDim.y*gridDim.z, m_max_grid_size[0]);
     }
 
     // initialize shared memory
     for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
       // this descends all the way to 1
-      if (threadIdx.x < i) {
+      if (threadId < i) {
         // no need for __syncthreads()
-        sd[threadIdx.x + i] = m_reduced_val;
+        sd[threadId + i] = m_reduced_val;
       }
     }
     __syncthreads();
 
-    sd[threadIdx.x] = val;
+    sd[threadId] = val;
 
     T temp = 0;
     __syncthreads();
 
     for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-      if (threadIdx.x < i) {
-        sd[threadIdx.x] += sd[threadIdx.x + i];
+      if (threadId < i) {
+        sd[threadId] += sd[threadId + i];
       }
       __syncthreads();
     }
 
-    if (threadIdx.x < WARP_SIZE) {
-      temp = sd[threadIdx.x];
+    if (threadId < WARP_SIZE) {
+      temp = sd[threadId];
       for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
         temp += shfl_xor(temp, i);
       }
@@ -676,8 +695,8 @@ class ReduceSum<cuda_reduce<BLOCK_SIZE>, T> {
 
     // one thread adds to gmem, we skip m_blockdata[m_blockoffset]
     // because we will be accumlating into this
-    if (threadIdx.x == 0) {
-      m_blockdata[m_blockoffset + blockIdx.x + 1] += temp;
+    if (threadId == 0) {
+      m_blockdata[m_blockoffset + blockId + 1] += temp;
     }
 
     return *this;
@@ -769,37 +788,42 @@ class ReduceSum<cuda_reduce_atomic<BLOCK_SIZE>, T> {
       T val) const {
     __shared__ T sd[BLOCK_SIZE];
 
+    int threadId = threadIdx.x + 
+               blockDim.x * threadIdx.y + 
+               (blockDim.x * blockDim.y) * threadIdx.z;
+
+
     // initialize shared memory
     for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
       // this descends all the way to 1
-      if (threadIdx.x < i) {
+      if (threadId < i) {
         // no need for __syncthreads()
-        sd[threadIdx.x + i] = m_reduced_val;
+        sd[threadId + i] = m_reduced_val;
       }
     }
     __syncthreads();
 
-    sd[threadIdx.x] = val;
+    sd[threadId] = val;
 
     T temp = 0;
     __syncthreads();
 
     for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-      if (threadIdx.x < i) {
-        sd[threadIdx.x] += sd[threadIdx.x + i];
+      if (threadId < i) {
+        sd[threadId] += sd[threadId + i];
       }
       __syncthreads();
     }
 
-    if (threadIdx.x < WARP_SIZE) {
-      temp = sd[threadIdx.x];
+    if (threadId < WARP_SIZE) {
+      temp = sd[threadId];
       for (int i = WARP_SIZE / 2; i > 0; i /= 2) {
         temp += shfl_xor(temp, i);
       }
     }
 
     // one thread adds to tally
-    if (threadIdx.x == 0) {
+    if (threadId == 0) {
       atomicAdd(&(m_tallydata->tally), temp);
     }
 
@@ -906,104 +930,110 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
     __shared__ CudaReductionLocBlockDataType sd[BLOCK_SIZE];
     __shared__ bool lastBlock;
 
+    int blockId = blockIdx.x 
+       + blockIdx.y * gridDim.x 
+       + gridDim.x * gridDim.y * blockIdx.z; 
+    int threadId = threadIdx.x + 
+               blockDim.x * threadIdx.y + 
+               (blockDim.x * blockDim.y) * threadIdx.z;
     // initialize shared memory
     for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
       // this descends all the way to 1
-      if (threadIdx.x < i) {
+      if (threadId < i) {
         // no need for __syncthreads()
-        sd[threadIdx.x + i].val = m_reduced_val;
-        sd[threadIdx.x + i].idx = m_reduced_idx;
+        sd[threadId + i].val = m_reduced_val;
+        sd[threadId + i].idx = m_reduced_idx;
       }
     }
     __syncthreads();
 
-    sd[threadIdx.x].val = val;
-    sd[threadIdx.x].idx = idx;  // need to reconcile loc vs idx naming
+    sd[threadId].val = val;
+    sd[threadId].idx = idx;  // need to reconcile loc vs idx naming
     __syncthreads();
 
     for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-      if (threadIdx.x < i) {
-        sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + i]);
+      if (threadId < i) {
+        sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + i]);
       }
       __syncthreads();
     }
 
-    if (threadIdx.x < 16) {
-      sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + 16]);
+    if (threadId < 16) {
+      sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + 16]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 8) {
-      sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + 8]);
+    if (threadId < 8) {
+      sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + 8]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 4) {
-      sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + 4]);
+    if (threadId < 4) {
+      sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + 4]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 2) {
-      sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + 2]);
+    if (threadId < 2) {
+      sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + 2]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 1) {
+    if (threadId < 1) {
       lastBlock = false;
-      sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + 1]);
-      m_blockdata[m_blockoffset + blockIdx.x + 1] =
-          RAJA_MINLOC(sd[threadIdx.x],
-                      m_blockdata[m_blockoffset + blockIdx.x + 1]);
+      sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + 1]);
+      m_blockdata[m_blockoffset + blockId + 1] =
+          RAJA_MINLOC(sd[threadId],
+                      m_blockdata[m_blockoffset + blockId + 1]);
       int oldBlockCount = atomicAdd(&retiredBlocks[m_myID], (int)1);
       lastBlock = (oldBlockCount == (gridDim.x - 1));
     }
     __syncthreads();
 
     if (lastBlock) {
-      if (threadIdx.x == 0) {
+      if (threadId == 0) {
         retiredBlocks[m_myID] = 0;
       }
 
       CudaReductionLocBlockDataType lmin;
       lmin.val = m_reduced_val;
       lmin.idx = m_reduced_idx;
-      for (int i = threadIdx.x; i < gridDim.x; i += BLOCK_SIZE) {
+      for (int i = threadId; i < gridDim.x; i += BLOCK_SIZE) {
         lmin = RAJA_MINLOC(lmin, m_blockdata[m_blockoffset + i + 1]);
       }
-      sd[threadIdx.x] = lmin;
+      sd[threadId] = lmin;
       __syncthreads();
 
       for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-        if (threadIdx.x < i) {
-          sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + i]);
+        if (threadId < i) {
+          sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + i]);
         }
         __syncthreads();
       }
 
-      if (threadIdx.x < 16) {
-        sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + 16]);
+      if (threadId < 16) {
+        sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + 16]);
       }
       __syncthreads();
 
-      if (threadIdx.x < 8) {
-        sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + 8]);
+      if (threadId < 8) {
+        sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + 8]);
       }
       __syncthreads();
 
-      if (threadIdx.x < 4) {
-        sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + 4]);
+      if (threadId < 4) {
+        sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + 4]);
       }
       __syncthreads();
 
-      if (threadIdx.x < 2) {
-        sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + 2]);
+      if (threadId < 2) {
+        sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + 2]);
       }
       __syncthreads();
 
-      if (threadIdx.x < 1) {
-        sd[threadIdx.x] = RAJA_MINLOC(sd[threadIdx.x], sd[threadIdx.x + 1]);
+      if (threadId < 1) {
+        sd[threadId] = RAJA_MINLOC(sd[threadId], sd[threadId + 1]);
         m_blockdata[m_blockoffset] =
-            RAJA_MINLOC(m_blockdata[m_blockoffset], sd[threadIdx.x]);
+            RAJA_MINLOC(m_blockdata[m_blockoffset], sd[threadId]);
       }
     }
     return *this;
@@ -1102,104 +1132,110 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
     __shared__ CudaReductionLocBlockDataType sd[BLOCK_SIZE];
     __shared__ bool lastBlock;
 
+    int blockId = blockIdx.x 
+       + blockIdx.y * gridDim.x 
+       + gridDim.x * gridDim.y * blockIdx.z; 
+    int threadId = threadIdx.x + 
+               blockDim.x * threadIdx.y + 
+               (blockDim.x * blockDim.y) * threadIdx.z;
     // initialize shared memory
     for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
       // this descends all the way to 1
-      if (threadIdx.x < i) {
+      if (threadId < i) {
         // no need for __syncthreads()
-        sd[threadIdx.x + i].val = m_reduced_val;
-        sd[threadIdx.x + i].idx = m_reduced_idx;
+        sd[threadId + i].val = m_reduced_val;
+        sd[threadId + i].idx = m_reduced_idx;
       }
     }
     __syncthreads();
 
-    sd[threadIdx.x].val = val;
-    sd[threadIdx.x].idx = idx;  // need to reconcile loc vs idx naming
+    sd[threadId].val = val;
+    sd[threadId].idx = idx;  // need to reconcile loc vs idx naming
     __syncthreads();
 
     for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-      if (threadIdx.x < i) {
-        sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + i]);
+      if (threadId < i) {
+        sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + i]);
       }
       __syncthreads();
     }
 
-    if (threadIdx.x < 16) {
-      sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + 16]);
+    if (threadId < 16) {
+      sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + 16]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 8) {
-      sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + 8]);
+    if (threadId < 8) {
+      sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + 8]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 4) {
-      sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + 4]);
+    if (threadId < 4) {
+      sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + 4]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 2) {
-      sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + 2]);
+    if (threadId < 2) {
+      sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + 2]);
     }
     __syncthreads();
 
-    if (threadIdx.x < 1) {
+    if (threadId < 1) {
       lastBlock = false;
-      sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + 1]);
-      m_blockdata[m_blockoffset + blockIdx.x + 1] =
-          RAJA_MAXLOC(sd[threadIdx.x],
-                      m_blockdata[m_blockoffset + blockIdx.x + 1]);
+      sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + 1]);
+      m_blockdata[m_blockoffset + blockId + 1] =
+          RAJA_MAXLOC(sd[threadId],
+                      m_blockdata[m_blockoffset + blockId + 1]);
       unsigned int oldBlockCount = atomicAdd(&retiredBlocks[m_myID], 1);
       lastBlock = (oldBlockCount == (gridDim.x - 1));
     }
     __syncthreads();
 
     if (lastBlock) {
-      if (threadIdx.x == 0) {
+      if (threadId == 0) {
         retiredBlocks[m_myID] = 0;
       }
 
       CudaReductionLocBlockDataType lmax;
       lmax.val = m_reduced_val;
       lmax.idx = m_reduced_idx;
-      for (int i = threadIdx.x; i < gridDim.x; i += BLOCK_SIZE) {
+      for (int i = threadId; i < gridDim.x; i += BLOCK_SIZE) {
         lmax = RAJA_MAXLOC(lmax, m_blockdata[m_blockoffset + i + 1]);
       }
-      sd[threadIdx.x] = lmax;
+      sd[threadId] = lmax;
       __syncthreads();
 
       for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
-        if (threadIdx.x < i) {
-          sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + i]);
+        if (threadId < i) {
+          sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + i]);
         }
         __syncthreads();
       }
 
-      if (threadIdx.x < 16) {
-        sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + 16]);
+      if (threadId < 16) {
+        sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + 16]);
       }
       __syncthreads();
 
-      if (threadIdx.x < 8) {
-        sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + 8]);
+      if (threadId < 8) {
+        sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + 8]);
       }
       __syncthreads();
 
-      if (threadIdx.x < 4) {
-        sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + 4]);
+      if (threadId < 4) {
+        sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + 4]);
       }
       __syncthreads();
 
-      if (threadIdx.x < 2) {
-        sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + 2]);
+      if (threadId < 2) {
+        sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + 2]);
       }
       __syncthreads();
 
-      if (threadIdx.x < 1) {
-        sd[threadIdx.x] = RAJA_MAXLOC(sd[threadIdx.x], sd[threadIdx.x + 1]);
+      if (threadId < 1) {
+        sd[threadId] = RAJA_MAXLOC(sd[threadId], sd[threadId + 1]);
         m_blockdata[m_blockoffset] =
-            RAJA_MAXLOC(m_blockdata[m_blockoffset], sd[threadIdx.x]);
+            RAJA_MAXLOC(m_blockdata[m_blockoffset], sd[threadId]);
       }
     }
     return *this;
-- 
GitLab


From d2aed47b765ddbbb3e3974f5e1074709d4ae3cc5 Mon Sep 17 00:00:00 2001
From: "Holger E. Jones" <jones19@llnl.gov>
Date: Wed, 22 Jun 2016 16:07:21 -0700
Subject: [PATCH 2/9] nested.exe GPU Unit test now exercises Reductions

---
 include/RAJA/exec-cuda/reduce_cuda.hxx | 24 ++++----
 test/unit-tests/GPUtests/Nested.cxx    | 81 +++++++++++++++++++++++---
 2 files changed, 86 insertions(+), 19 deletions(-)

diff --git a/include/RAJA/exec-cuda/reduce_cuda.hxx b/include/RAJA/exec-cuda/reduce_cuda.hxx
index 12b2bd041..df5831a79 100644
--- a/include/RAJA/exec-cuda/reduce_cuda.hxx
+++ b/include/RAJA/exec-cuda/reduce_cuda.hxx
@@ -941,7 +941,7 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
   // Operator to retrieve index value of min (before object is destroyed).
   //
   Index_type getMinLoc() {
-    cudaErrchk(cudaDeviceSynchronize());  // it would be good not to call this
+    cudaErrchk(cudaDeviceSynchronize());  
     m_reduced_idx = m_blockdata[m_blockoffset].idx;
     return m_reduced_idx;
   }
@@ -973,7 +973,7 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
     __syncthreads();
 
     sd[threadId].val = val;
-    sd[threadId].idx = idx;  // need to reconcile loc vs idx naming
+    sd[threadId].idx = idx; 
     __syncthreads();
 
     for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
@@ -1010,7 +1010,7 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
           RAJA_MINLOC(sd[threadId],
                       m_blockdata[m_blockoffset + blockId + 1]);
       int oldBlockCount = atomicAdd(&retiredBlocks[m_myID], (int)1);
-      lastBlock = (oldBlockCount == (gridDim.x - 1));
+      lastBlock = (oldBlockCount == ((gridDim.x * gridDim.y * gridDim.z)- 1));
     }
     __syncthreads();
 
@@ -1019,10 +1019,10 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
         retiredBlocks[m_myID] = 0;
       }
 
-      CudaReductionLocBlockDataType lmin;
-      lmin.val = m_reduced_val;
-      lmin.idx = m_reduced_idx;
-      for (int i = threadId; i < gridDim.x; i += BLOCK_SIZE) {
+      CudaReductionLocBlockDataType lmin={m_reduced_val,m_reduced_idx};
+      int blocks = gridDim.x * gridDim.y * gridDim.z;
+      int threads = blockDim.x * blockDim.y * blockDim.z;
+      for (int i = threadId; i < blocks; i += threads) {
         lmin = RAJA_MINLOC(lmin, m_blockdata[m_blockoffset + i + 1]);
       }
       sd[threadId] = lmin;
@@ -1218,7 +1218,7 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
           RAJA_MAXLOC(sd[threadId],
                       m_blockdata[m_blockoffset + blockId + 1]);
       unsigned int oldBlockCount = atomicAdd(&retiredBlocks[m_myID], 1);
-      lastBlock = (oldBlockCount == (gridDim.x - 1));
+      lastBlock = (oldBlockCount == ((gridDim.x * gridDim.y * gridDim.z) - 1));
     }
     __syncthreads();
 
@@ -1227,10 +1227,10 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
         retiredBlocks[m_myID] = 0;
       }
 
-      CudaReductionLocBlockDataType lmax;
-      lmax.val = m_reduced_val;
-      lmax.idx = m_reduced_idx;
-      for (int i = threadId; i < gridDim.x; i += BLOCK_SIZE) {
+      CudaReductionLocBlockDataType lmax={m_reduced_val,m_reduced_idx};
+      int blocks = gridDim.x * gridDim.y * gridDim.z;
+      int threads = blockDim.x * blockDim.y * blockDim.z;
+      for (int i = threadId; i < blocks; i += threads) {
         lmax = RAJA_MAXLOC(lmax, m_blockdata[m_blockoffset + i + 1]);
       }
       sd[threadId] = lmax;
diff --git a/test/unit-tests/GPUtests/Nested.cxx b/test/unit-tests/GPUtests/Nested.cxx
index 4016bc2be..f96e71400 100644
--- a/test/unit-tests/GPUtests/Nested.cxx
+++ b/test/unit-tests/GPUtests/Nested.cxx
@@ -13,6 +13,7 @@
 //
 
 #include <cstdlib>
+#include <cfloat>
 #include <time.h>
 
 #include <string>
@@ -26,6 +27,12 @@ using namespace std;
 
 #include "Compare.hxx"
 
+
+typedef struct {
+  double val;
+  int idx;
+} minmaxloc_t;
+
 //
 // Global variables for counting tests executed/passed.
 //
@@ -35,6 +42,10 @@ unsigned s_ntests_passed_total = 0;
 unsigned s_ntests_run = 0;
 unsigned s_ntests_passed = 0;
 
+// block_size sizes the shared-memory scratch array used by the reduction variables.
+// It must be at least the largest number of threads per block (x*y*z) launched by this test.
+const size_t block_size = 256;
+
 ///////////////////////////////////////////////////////////////////////////
 //
 // Example LTimes kernel test routines
@@ -71,6 +82,21 @@ void runLTimesTest(std::string const &policy,
   std::vector<double> psi_data(num_directions * num_groups * num_zones);
   std::vector<double> phi_data(num_moments * num_groups * num_zones, 0.0);
 
+  // setup CUDA Reduction variables to be exercised
+  ReduceSum<cuda_reduce<block_size>,double> pdsum(0.0);
+  ReduceMin<cuda_reduce<block_size>,double> pdmin(DBL_MAX);
+  ReduceMax<cuda_reduce<block_size>,double> pdmax(-DBL_MAX);
+  ReduceMinLoc<cuda_reduce<block_size>,double> pdminloc(DBL_MAX,-1);
+  ReduceMaxLoc<cuda_reduce<block_size>,double> pdmaxloc(-DBL_MAX,-1);
+
+  // setup local Reduction variables as a crosscheck
+  double lsum=0.0;
+  double lmin=DBL_MAX;
+  double lmax=-DBL_MAX;
+  minmaxloc_t lminloc={DBL_MAX,-1};
+  minmaxloc_t lmaxloc={-DBL_MAX,-1};
+
+  //
   // randomize data
   for (size_t i = 0; i < ell_data.size(); ++i) {
     ell_data[i] = drand48();
@@ -115,11 +141,17 @@ void runLTimesTest(std::string const &policy,
       RangeSegment(0, num_zones),
       [=] __device__(IMoment m, IDirection d, IGroup g, IZone z) {
         // printf("%d,%d,%d,%d\n", *m, *d, *g, *z);
-        phi(m, g, z) += ell(m, d) * psi(d, g, z);
+        double val = ell(m,d) * psi(d,g,z); 
+        phi(m,g,z) += val; 
+        pdsum += val;
+        pdmin.min(val);
+        pdmax.max(val);
+        int index = *d + (*m * num_directions) + (*g * num_directions * num_moments) + (*z * num_directions * num_moments * num_groups);
+        pdminloc.minloc(val,index);
+        pdmaxloc.maxloc(val,index);
       });
 
   cudaDeviceSynchronize();
-
   // Copy to host the result
   cudaMemcpy(&phi_data[0],
              d_phi,
@@ -140,15 +172,21 @@ void runLTimesTest(std::string const &policy,
   ell.data = &ell_data[0];
   phi.data = &phi_data[0];
   psi.data = &psi_data[0];
-
   for (IZone z(0); z < num_zones; ++z) {
     for (IGroup g(0); g < num_groups; ++g) {
       for (IMoment m(0); m < num_moments; ++m) {
         double total = 0.0;
         for (IDirection d(0); d < num_directions; ++d) {
-          total += ell(m, d) * psi(d, g, z);
+          double val = ell(m,d) * psi(d,g,z);
+          total += val;
+          lmin = RAJA_MIN(lmin,val);
+          lmax = RAJA_MAX(lmax,val);
+          int index = *d + (*m * num_directions) + (*g * num_directions * num_moments) + (*z * num_directions * num_moments * num_groups);
+          minmaxloc_t testMinMaxLoc={val,index};
+          lminloc = RAJA_MINLOC(lminloc,testMinMaxLoc);
+          lmaxloc = RAJA_MAXLOC(lmaxloc,testMinMaxLoc);
         }
-
+        lsum += total;
         // check answer with some reasonable tolerance
         if (std::abs(total - phi(m, g, z)) > 1e-12) {
           nfailed++;
@@ -156,10 +194,39 @@ void runLTimesTest(std::string const &policy,
       }
     }
   }
+  size_t reductionsFailed = 0;
+  std::string whichFailed;
 
-  if (nfailed) {
-    cout << "\n TEST FAILURE: " << nfailed << " elements failed" << endl;
+  if (std::abs(lsum - double(pdsum)) > 1e-9) {
+    reductionsFailed++;
+    whichFailed += "[ReduceSum]";
+  }
+
+  if(lmin != double(pdmin)) {
+    reductionsFailed++;
+    whichFailed += "[ReduceMin]";
+  }  
+
+  if(lmax != double(pdmax)) {
+    reductionsFailed++;
+    whichFailed += "[ReduceMax]";
+  }
+
+  if((lminloc.val != double(pdminloc)) || (lminloc.idx != pdminloc.getMinLoc())) {  // fail if the value OR the location disagrees with the host crosscheck
+    reductionsFailed++;
+    whichFailed += "[ReduceMinLoc]";
+  }
+
+  if((lmaxloc.val != double(pdmaxloc)) || (lmaxloc.idx != pdmaxloc.getMaxLoc())) {  // fail if the value OR the location disagrees with the host crosscheck
+    reductionsFailed++;
+    whichFailed += "[ReduceMaxLoc]";
+  }
 
+  if (nfailed || reductionsFailed) {
+    cout << "\n TEST FAILURE: " << nfailed << " elements failed" << endl;
+    if(reductionsFailed) {
+      cout << "  REDUCTIONS FAILURE: " << whichFailed << endl; 
+    }
   } else {
     s_ntests_passed++;
     s_ntests_passed_total++;
-- 
GitLab


From e5ea4f72635abcf2df0bb2021c5ee08b2e3cee2b Mon Sep 17 00:00:00 2001
From: "Holger E. Jones" <jones19@llnl.gov>
Date: Mon, 27 Jun 2016 10:28:23 -0700
Subject: [PATCH 3/9] minmaxloc needed threadfence before atomicAdd

---
 include/RAJA/exec-cuda/reduce_cuda.hxx | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/RAJA/exec-cuda/reduce_cuda.hxx b/include/RAJA/exec-cuda/reduce_cuda.hxx
index df5831a79..2d3be0213 100644
--- a/include/RAJA/exec-cuda/reduce_cuda.hxx
+++ b/include/RAJA/exec-cuda/reduce_cuda.hxx
@@ -1009,6 +1009,7 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
       m_blockdata[m_blockoffset + blockId + 1] =
           RAJA_MINLOC(sd[threadId],
                       m_blockdata[m_blockoffset + blockId + 1]);
+      __threadfence();
       int oldBlockCount = atomicAdd(&retiredBlocks[m_myID], (int)1);
       lastBlock = (oldBlockCount == ((gridDim.x * gridDim.y * gridDim.z)- 1));
     }
@@ -1133,6 +1134,11 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
 #endif
       // OK to perform cudaFree of cudaMalloc vars if needed...
     }
+//    else{
+//#if defined(__CUDA_ARCH__)
+//      printf("~ReduceMaxLoc\n");
+//#endif
+//    }
   }
 
   //
@@ -1217,6 +1223,7 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
       m_blockdata[m_blockoffset + blockId + 1] =
           RAJA_MAXLOC(sd[threadId],
                       m_blockdata[m_blockoffset + blockId + 1]);
+      __threadfence();
       unsigned int oldBlockCount = atomicAdd(&retiredBlocks[m_myID], 1);
       lastBlock = (oldBlockCount == ((gridDim.x * gridDim.y * gridDim.z) - 1));
     }
@@ -1230,6 +1237,7 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
       CudaReductionLocBlockDataType lmax={m_reduced_val,m_reduced_idx};
       int blocks = gridDim.x * gridDim.y * gridDim.z;
       int threads = blockDim.x * blockDim.y * blockDim.z;
+
       for (int i = threadId; i < blocks; i += threads) {
         lmax = RAJA_MAXLOC(lmax, m_blockdata[m_blockoffset + i + 1]);
       }
-- 
GitLab


From 0537b341bbfc5ec11b75dd84ca5f04001b216b46 Mon Sep 17 00:00:00 2001
From: "Holger E. Jones" <jones19@llnl.gov>
Date: Thu, 30 Jun 2016 14:19:31 -0700
Subject: [PATCH 4/9] static assert BLOCK_SIZE HWM switched to 1024, other
 cleanup

---
 include/RAJA/exec-cuda/MemUtils_CUDA.hxx |  1 +
 include/RAJA/exec-cuda/reduce_cuda.hxx   | 42 +++++++++---------------
 2 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/include/RAJA/exec-cuda/MemUtils_CUDA.hxx b/include/RAJA/exec-cuda/MemUtils_CUDA.hxx
index 7492fe70b..65a434766 100644
--- a/include/RAJA/exec-cuda/MemUtils_CUDA.hxx
+++ b/include/RAJA/exec-cuda/MemUtils_CUDA.hxx
@@ -78,6 +78,7 @@ typedef struct {
   Index_type idx;
 } CudaReductionLocBlockDataType;
 
+
 typedef struct {
   CudaReductionBlockDataType tally;
   CudaReductionBlockDataType initVal;
diff --git a/include/RAJA/exec-cuda/reduce_cuda.hxx b/include/RAJA/exec-cuda/reduce_cuda.hxx
index 2d3be0213..e9fee62b8 100644
--- a/include/RAJA/exec-cuda/reduce_cuda.hxx
+++ b/include/RAJA/exec-cuda/reduce_cuda.hxx
@@ -59,7 +59,7 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-
+#include <cassert>
 #include "RAJA/int_datatypes.hxx"
 
 #include "RAJA/reducers.hxx"
@@ -342,7 +342,6 @@ class ReduceMin<cuda_reduce<BLOCK_SIZE>, T> {
 #else
       releaseCudaReductionId(m_myID);
 #endif
-      // OK to perform cudaFree of cudaMalloc vars if needed...
     }
   }
 
@@ -431,9 +430,9 @@ class ReduceMin<cuda_reduce<BLOCK_SIZE>, T> {
   
   // Sanity checks for block size
   static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE&(BLOCK_SIZE-1))); 
-  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=2048));
+  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=1024));
   static_assert(powerOfTwoCheck,"Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 2048");
+  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 1024");
 };
 
 /*!
@@ -480,7 +479,6 @@ class ReduceMax<cuda_reduce<BLOCK_SIZE>, T> {
 #else
       releaseCudaReductionId(m_myID);
 #endif
-      // OK to perform cudaFree of cudaMalloc vars if needed...
     }
   }
 
@@ -568,9 +566,9 @@ class ReduceMax<cuda_reduce<BLOCK_SIZE>, T> {
   
   // Sanity checks for block size
   static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE&(BLOCK_SIZE-1))); 
-  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=2048));
+  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=1024));
   static_assert(powerOfTwoCheck,"Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 2048");
+  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 1024");
 };
 
 /*!
@@ -634,7 +632,6 @@ class ReduceSum<cuda_reduce<BLOCK_SIZE>, T> {
 #else
       releaseCudaReductionId(m_myID);
 #endif
-      // OK to perform cudaFree of cudaMalloc vars if needed...
     }
   }
 
@@ -648,6 +645,7 @@ class ReduceSum<cuda_reduce<BLOCK_SIZE>, T> {
     m_blockdata[m_blockoffset] = static_cast<T>(0);
 
     size_t grid_size = m_max_grid_size[0];
+    assert(grid_size < RAJA_CUDA_REDUCE_BLOCK_LENGTH);
     for (size_t i = 1; i <= grid_size; ++i) {
       m_blockdata[m_blockoffset] += m_blockdata[m_blockoffset + i];
     }
@@ -734,9 +732,9 @@ class ReduceSum<cuda_reduce<BLOCK_SIZE>, T> {
 
   // Sanity checks for block size
   static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE&(BLOCK_SIZE-1))); 
-  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=2048));
+  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=1024));
   static_assert(powerOfTwoCheck,"Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 2048");
+  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 1024");
 };
 
 /*!
@@ -785,7 +783,6 @@ class ReduceSum<cuda_reduce_atomic<BLOCK_SIZE>, T> {
 #else
       releaseCudaReductionId(m_myID);
 #endif
-      // OK to perform cudaFree of cudaMalloc vars if needed...
     }
   }
 
@@ -866,9 +863,9 @@ class ReduceSum<cuda_reduce_atomic<BLOCK_SIZE>, T> {
 
   // Sanity checks for block size
   static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE&(BLOCK_SIZE-1))); 
-  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=2048));
+  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=1024));
   static_assert(powerOfTwoCheck,"Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 2048");
+  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 1024");
 };
 
 ///
@@ -923,7 +920,6 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
 #else
       releaseCudaReductionId(m_myID);
 #endif
-      // OK to perform cudaFree of cudaMalloc vars if needed...
     }
   }
 
@@ -1084,9 +1080,9 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
 
   // Sanity checks for block size
   static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE&(BLOCK_SIZE-1))); 
-  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=2048));
+  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=1024));
   static_assert(powerOfTwoCheck,"Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 2048");
+  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 1024");
 };
 
 template <size_t BLOCK_SIZE, typename T>
@@ -1132,13 +1128,7 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
 #else
       releaseCudaReductionId(m_myID);
 #endif
-      // OK to perform cudaFree of cudaMalloc vars if needed...
     }
-//    else{
-//#if defined(__CUDA_ARCH__)
-//      printf("~ReduceMaxLoc\n");
-//#endif
-//    }
   }
 
   //
@@ -1155,7 +1145,7 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
   // Operator to retrieve index value of min (before object is destroyed).
   //
   Index_type getMaxLoc() {
-    cudaErrchk(cudaDeviceSynchronize());  // it would be good not to call this
+    cudaErrchk(cudaDeviceSynchronize());  
     m_reduced_idx = m_blockdata[m_blockoffset].idx;
     return m_reduced_idx;
   }
@@ -1187,7 +1177,7 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
     __syncthreads();
 
     sd[threadId].val = val;
-    sd[threadId].idx = idx;  // need to reconcile loc vs idx naming
+    sd[threadId].idx = idx; 
     __syncthreads();
 
     for (int i = BLOCK_SIZE / 2; i >= WARP_SIZE; i /= 2) {
@@ -1299,9 +1289,9 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
 
   // Sanity checks for block size
   static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE&(BLOCK_SIZE-1))); 
-  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=2048));
+  static constexpr bool reasonableRangeCheck = ((BLOCK_SIZE>=32) && (BLOCK_SIZE<=1024));
   static_assert(powerOfTwoCheck,"Error: block sizes must be a power of 2");
-  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 2048");
+  static_assert(reasonableRangeCheck,"Error: block sizes must be between 32 and 1024");
 };
 
 }  // closing brace for RAJA namespace
-- 
GitLab


From 57d4c3c9c3cf325c4afb95c0cb9cff88cb51492b Mon Sep 17 00:00:00 2001
From: "Holger E. Jones" <jones19@llnl.gov>
Date: Tue, 5 Jul 2016 15:16:32 -0700
Subject: [PATCH 5/9] added max grid assert in minmaxloc accessor

---
 include/RAJA/exec-cuda/reduce_cuda.hxx | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/include/RAJA/exec-cuda/reduce_cuda.hxx b/include/RAJA/exec-cuda/reduce_cuda.hxx
index e9fee62b8..5ab07c38d 100644
--- a/include/RAJA/exec-cuda/reduce_cuda.hxx
+++ b/include/RAJA/exec-cuda/reduce_cuda.hxx
@@ -890,6 +890,9 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
     m_myID = getCudaReductionId();
     retiredBlocks[m_myID] = 0;
     m_blockdata = getCudaReductionLocMemBlock(m_myID);
+    // m_blockdata[0].val records the largest grid size seen; operator T() asserts it is in range
+    m_max_grid_size = m_blockdata;
+    m_max_grid_size[0].val = 0;
     m_blockoffset = 1;
     m_blockdata[m_blockoffset].val = init_val;
     m_blockdata[m_blockoffset].idx = init_loc;
@@ -929,6 +932,8 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
   //
   operator T() {
     cudaErrchk(cudaDeviceSynchronize());
+    size_t grid_size = m_max_grid_size[0].val;
+    assert(grid_size < RAJA_CUDA_REDUCE_BLOCK_LENGTH);
     m_reduced_val = static_cast<T>(m_blockdata[m_blockoffset].val);
     return m_reduced_val;
   }
@@ -957,6 +962,11 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
     int threadId = threadIdx.x + 
                blockDim.x * threadIdx.y + 
                (blockDim.x * blockDim.y) * threadIdx.z;
+
+    if (blockId  + threadId == 0) {
+      m_max_grid_size[0].val = RAJA_MAX(gridDim.x*gridDim.y*gridDim.z, m_max_grid_size[0].val);
+    }
+
     // initialize shared memory
     for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
       // this descends all the way to 1
@@ -1077,6 +1087,7 @@ class ReduceMinLoc<cuda_reduce<BLOCK_SIZE>, T> {
   Index_type m_reduced_idx;
 
   CudaReductionLocBlockDataType *m_blockdata;
+  CudaReductionLocBlockDataType *m_max_grid_size;
 
   // Sanity checks for block size
   static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE&(BLOCK_SIZE-1))); 
@@ -1098,6 +1109,9 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
     m_myID = getCudaReductionId();
     retiredBlocks[m_myID] = 0;
     m_blockdata = getCudaReductionLocMemBlock(m_myID);
+    // m_blockdata[0].val records the largest grid size seen; operator T() asserts it is in range
+    m_max_grid_size = m_blockdata;
+    m_max_grid_size[0].val = 0;
     m_blockoffset = 1;
     m_blockdata[m_blockoffset].val = init_val;
     m_blockdata[m_blockoffset].idx = init_loc;
@@ -1137,6 +1151,8 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
   //
   operator T() {
     cudaErrchk(cudaDeviceSynchronize());
+    size_t grid_size = m_max_grid_size[0].val;
+    assert(grid_size < RAJA_CUDA_REDUCE_BLOCK_LENGTH);
     m_reduced_val = static_cast<T>(m_blockdata[m_blockoffset].val);
     return m_reduced_val;
   }
@@ -1165,6 +1181,11 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
     int threadId = threadIdx.x + 
                blockDim.x * threadIdx.y + 
                (blockDim.x * blockDim.y) * threadIdx.z;
+
+    if (blockId  + threadId == 0) {
+      m_max_grid_size[0].val = RAJA_MAX(gridDim.x*gridDim.y*gridDim.z, m_max_grid_size[0].val);
+    }
+
     // initialize shared memory
     for (int i = BLOCK_SIZE / 2; i > 0; i /= 2) {
       // this descends all the way to 1
@@ -1286,6 +1307,7 @@ class ReduceMaxLoc<cuda_reduce<BLOCK_SIZE>, T> {
   Index_type m_reduced_idx;
 
   CudaReductionLocBlockDataType *m_blockdata;
+  CudaReductionLocBlockDataType *m_max_grid_size;
 
   // Sanity checks for block size
   static constexpr bool powerOfTwoCheck = (!(BLOCK_SIZE&(BLOCK_SIZE-1))); 
-- 
GitLab


From 15557b8e567a355bea5887bb75679efad8eded38 Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Tue, 28 Jun 2016 11:33:05 -0700
Subject: [PATCH 6/9] Delete example applications

---
 test/CMakeLists.txt                           |    4 -
 test/Kripke-v1.1/CMakeLists.txt               |   48 -
 .../Kripke-v1.1-RAJA/CMakeLists.txt           |   59 -
 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke.cpp  |  525 ---
 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke.h    |  203 -
 .../Kripke-v1.1-RAJA/Kripke/CMakeLists.txt    |   39 -
 .../Kripke-v1.1-RAJA/Kripke/DView.h           |  159 -
 .../Kripke-v1.1-RAJA/Kripke/Directions.cpp    |  209 -
 .../Kripke-v1.1-RAJA/Kripke/Directions.h      |   61 -
 .../Kripke-v1.1-RAJA/Kripke/Grid.cpp          |  561 ---
 .../Kripke-v1.1-RAJA/Kripke/Grid.h            |  120 -
 .../Kripke/Input_Variables.cpp                |  149 -
 .../Kripke-v1.1-RAJA/Kripke/Input_Variables.h |   79 -
 .../Kripke-v1.1-RAJA/Kripke/Kernel.cpp        |  521 ---
 .../Kripke-v1.1-RAJA/Kripke/Kernel.h          |   73 -
 .../Kripke/Kernel/DataPolicy.h                |  213 -
 .../Kripke/Kernel/Kernel_3d_DGZ.cpp           |   65 -
 .../Kripke/Kernel/Kernel_3d_DGZ.h             |   51 -
 .../Kripke/Kernel/Kernel_3d_DZG.cpp           |   64 -
 .../Kripke/Kernel/Kernel_3d_DZG.h             |   51 -
 .../Kripke/Kernel/Kernel_3d_GDZ.cpp           |   65 -
 .../Kripke/Kernel/Kernel_3d_GDZ.h             |   51 -
 .../Kripke/Kernel/Kernel_3d_GZD.cpp           |   64 -
 .../Kripke/Kernel/Kernel_3d_GZD.h             |   51 -
 .../Kripke/Kernel/Kernel_3d_ZDG.cpp           |   64 -
 .../Kripke/Kernel/Kernel_3d_ZDG.h             |   51 -
 .../Kripke/Kernel/Kernel_3d_ZGD.cpp           |   64 -
 .../Kripke/Kernel/Kernel_3d_ZGD.h             |   51 -
 .../Kripke/Kernel/LPlusTimesPolicy.h          |  125 -
 .../Kripke/Kernel/LTimesPolicy.h              |  122 -
 .../Kripke/Kernel/ParticleEditPolicy.h        |   70 -
 .../Kripke/Kernel/ScatteringPolicy.h          |  120 -
 .../Kripke/Kernel/SourcePolicy.h              |   86 -
 .../Kripke/Kernel/SweepPolicy.h               |  113 -
 .../Kripke-v1.1-RAJA/Kripke/KernelFunctors.h  |  262 --
 .../Kripke-v1.1-RAJA/Kripke/Layout.cpp        |  379 --
 .../Kripke-v1.1-RAJA/Kripke/Layout.h          |  101 -
 .../Kripke-v1.1-RAJA/Kripke/ParallelComm.cpp  |  283 --
 .../Kripke-v1.1-RAJA/Kripke/ParallelComm.h    |  124 -
 .../Kripke/ParallelComm/BlockJacobiComm.cpp   |  117 -
 .../Kripke/ParallelComm/SweepComm.cpp         |  101 -
 .../Kripke-v1.1-RAJA/Kripke/SubTVec.h         |  245 --
 .../Kripke-v1.1-RAJA/Kripke/Subdomain.cpp     |  598 ---
 .../Kripke-v1.1-RAJA/Kripke/Subdomain.h       |  141 -
 .../Kripke-v1.1-RAJA/Kripke/Sweep_Solver.cpp  |  190 -
 .../Kripke/Test/TestKernels.cpp               |  172 -
 .../Kripke/Test/TestKernels.h                 |   40 -
 .../Kripke-v1.1-RAJA/Kripke/Timing.cpp        |  281 --
 .../Kripke-v1.1-RAJA/Kripke/Timing.h          |  121 -
 .../Kripke-v1.1/Kripke-v1.1-RAJA/KripkeRAJA.h |  107 -
 test/Kripke-v1.1/Kripke-v1.1-RAJA/NOTICE.md   |   40 -
 test/Kripke-v1.1/Kripke-v1.1-RAJA/README.md   |  363 --
 .../Kripke-v1.1-baseline/CMakeLists.txt       |   55 -
 .../Kripke-v1.1-baseline/Kripke.cpp           |  525 ---
 .../Kripke-v1.1/Kripke-v1.1-baseline/Kripke.h |  193 -
 .../Kripke/CMakeLists.txt                     |   33 -
 .../Kripke/Directions.cpp                     |  209 -
 .../Kripke-v1.1-baseline/Kripke/Directions.h  |   61 -
 .../Kripke-v1.1-baseline/Kripke/Grid.cpp      |  561 ---
 .../Kripke-v1.1-baseline/Kripke/Grid.h        |  101 -
 .../Kripke/Input_Variables.cpp                |  149 -
 .../Kripke/Input_Variables.h                  |   79 -
 .../Kripke-v1.1-baseline/Kripke/Kernel.cpp    |   70 -
 .../Kripke-v1.1-baseline/Kripke/Kernel.h      |   70 -
 .../Kripke/Kernel/Kernel_3d_DGZ.cpp           |  383 --
 .../Kripke/Kernel/Kernel_3d_DGZ.h             |   55 -
 .../Kripke/Kernel/Kernel_3d_DZG.cpp           |  375 --
 .../Kripke/Kernel/Kernel_3d_DZG.h             |   55 -
 .../Kripke/Kernel/Kernel_3d_GDZ.cpp           |  375 --
 .../Kripke/Kernel/Kernel_3d_GDZ.h             |   55 -
 .../Kripke/Kernel/Kernel_3d_GZD.cpp           |  369 --
 .../Kripke/Kernel/Kernel_3d_GZD.h             |   55 -
 .../Kripke/Kernel/Kernel_3d_ZDG.cpp           |  367 --
 .../Kripke/Kernel/Kernel_3d_ZDG.h             |   55 -
 .../Kripke/Kernel/Kernel_3d_ZGD.cpp           |  366 --
 .../Kripke/Kernel/Kernel_3d_ZGD.h             |   55 -
 .../Kripke-v1.1-baseline/Kripke/Layout.cpp    |  379 --
 .../Kripke-v1.1-baseline/Kripke/Layout.h      |  101 -
 .../Kripke/ParallelComm.cpp                   |  283 --
 .../Kripke/ParallelComm.h                     |  124 -
 .../Kripke/ParallelComm/BlockJacobiComm.cpp   |  117 -
 .../Kripke/ParallelComm/SweepComm.cpp         |  101 -
 .../Kripke-v1.1-baseline/Kripke/SubTVec.h     |  245 --
 .../Kripke-v1.1-baseline/Kripke/Subdomain.cpp |  481 ---
 .../Kripke-v1.1-baseline/Kripke/Subdomain.h   |  125 -
 .../Kripke/Sweep_Solver.cpp                   |  190 -
 .../Kripke/Test/TestKernels.cpp               |  172 -
 .../Kripke/Test/TestKernels.h                 |   40 -
 .../Kripke-v1.1-baseline/Kripke/Timing.cpp    |  281 --
 .../Kripke-v1.1-baseline/Kripke/Timing.h      |  121 -
 .../Kripke-v1.1-baseline/NOTICE.md            |   40 -
 .../Kripke-v1.1-baseline/README.md            |  353 --
 test/LULESH-v1.0/CMakeLists.txt               |   45 -
 .../LULESH-v1.0_RAJA-variants/CMakeLists.txt  |   73 -
 .../luleshMemory.hxx                          |  178 -
 .../luleshPolicy.hxx                          |  252 --
 .../luleshRAJA-parallel-FT.cxx                | 3383 ----------------
 .../luleshRAJA-parallel.cxx                   | 3484 -----------------
 .../luleshRAJA-serial.cxx                     | 3200 ---------------
 .../LULESH-v1.0_RAJA-variants/sigcatch.cmd    |   12 -
 .../LULESH-v1.0_baseline/CMakeLists.txt       |   65 -
 .../LULESH-v1.0_baseline/lulesh.cc            | 2936 --------------
 .../LULESH-v1.0_baseline/luleshOMP.cc         | 3190 ---------------
 .../LULESH-v1.0_baseline/luleshOMP_NG.cc      | 3142 ---------------
 test/LULESH-v1.0/LULESH-v1.0_baseline/runme   |    3 -
 test/LULESH-v1.0/README                       |   42 -
 test/LULESH-v2.0/CMakeLists.txt               |   45 -
 .../LULESH-v2.0_RAJA-variants/CMakeLists.txt  |   46 -
 .../LULESH-v2.0_RAJA-IndexSet/CMakeLists.txt  |   65 -
 .../LULESH-v2.0_RAJA-IndexSet/README          |   53 -
 .../LULESH-v2.0_RAJA-IndexSet/lulesh-comm.cc  | 1837 ---------
 .../LULESH-v2.0_RAJA-IndexSet/lulesh-init.cc  |  884 -----
 .../LULESH-v2.0_RAJA-IndexSet/lulesh-util.cc  |  233 --
 .../LULESH-v2.0_RAJA-IndexSet/lulesh-viz.cc   |  432 --
 .../LULESH-v2.0_RAJA-IndexSet/lulesh.cc       | 2639 -------------
 .../LULESH-v2.0_RAJA-IndexSet/lulesh.h        |   22 -
 .../luleshMemory.hxx                          |  187 -
 .../luleshPolicy.hxx                          |   94 -
 .../LULESH-v2.0_RAJA-IndexSet/lulesh_ptr.h    |  686 ----
 .../LULESH-v2.0_RAJA-IndexSet/lulesh_stl.h    |  674 ----
 .../LULESH-v2.0_RAJA-IndexSet/lulesh_tuple.h  |  667 ----
 .../CMakeLists.txt                            |   65 -
 .../Makefile.keep                             |   73 -
 .../LULESH-v2.0_RAJA-MICfriendly/Makefile.ref |  107 -
 .../LULESH-v2.0_RAJA-MICfriendly/README       |   53 -
 .../lulesh-comm.cc                            | 1837 ---------
 .../lulesh-init.cc                            |  874 -----
 .../lulesh-util.cc                            |  232 --
 .../lulesh-viz.cc                             |  432 --
 .../LULESH-v2.0_RAJA-MICfriendly/lulesh.cc    | 2639 -------------
 .../LULESH-v2.0_RAJA-MICfriendly/lulesh.h     |   41 -
 .../LULESH-v2.0_RAJA-MICfriendly/lulesh_ptr.h |  692 ----
 .../LULESH-v2.0_RAJA-MICfriendly/lulesh_raw.h |  590 ---
 .../LULESH-v2.0_RAJA-MICfriendly/lulesh_stl.h |  679 ----
 .../lulesh_tuple.h                            |  649 ---
 .../LULESH-v2.0_RAJA-MICfriendly/subs         |   10 -
 .../LULESH-v2.0_RAJA-basic/CMakeLists.txt     |   63 -
 .../LULESH-v2.0_RAJA-basic/RAJAspecial.hxx    |   36 -
 .../LULESH-v2.0_RAJA-basic/README             |   53 -
 .../LULESH-v2.0_RAJA-basic/lulesh-comm.cc     | 1837 ---------
 .../LULESH-v2.0_RAJA-basic/lulesh-init.cc     |  739 ----
 .../LULESH-v2.0_RAJA-basic/lulesh-util.cc     |  224 --
 .../LULESH-v2.0_RAJA-basic/lulesh-viz.cc      |  432 --
 .../LULESH-v2.0_RAJA-basic/lulesh.cc          | 2727 -------------
 .../lulesh.cc.src-KEEP_FULLCONVERT            | 2737 -------------
 .../LULESH-v2.0_RAJA-basic/lulesh.h           |  632 ---
 .../LULESH-v2.0_RAJA-basic/lulesh_tuple.h     |  618 ---
 .../LULESH-v2.0_baseline/CMakeLists.txt       |   57 -
 test/LULESH-v2.0/LULESH-v2.0_baseline/README  |   53 -
 .../LULESH-v2.0_baseline/lulesh-comm.cc       | 1837 ---------
 .../LULESH-v2.0_baseline/lulesh-init.cc       |  734 ----
 .../LULESH-v2.0_baseline/lulesh-util.cc       |  224 --
 .../LULESH-v2.0_baseline/lulesh-viz.cc        |  432 --
 .../LULESH-v2.0_baseline/lulesh.cc            | 2819 -------------
 .../LULESH-v2.0/LULESH-v2.0_baseline/lulesh.h |  661 ----
 .../LULESH-v2.0_baseline/lulesh_ptr.h         |  668 ----
 .../LULESH-v2.0_baseline/lulesh_tuple.h       |  616 ---
 test/LULESH-v2.0/README                       |   41 -
 158 files changed, 71622 deletions(-)
 delete mode 100644 test/Kripke-v1.1/CMakeLists.txt
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/CMakeLists.txt
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/CMakeLists.txt
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/DView.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Directions.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Directions.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Grid.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Grid.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Input_Variables.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Input_Variables.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/DataPolicy.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DGZ.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DGZ.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DZG.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DZG.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GDZ.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GDZ.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GZD.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GZD.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZDG.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZDG.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZGD.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZGD.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/LPlusTimesPolicy.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/LTimesPolicy.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/ParticleEditPolicy.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/ScatteringPolicy.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/SourcePolicy.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/SweepPolicy.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/KernelFunctors.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Layout.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Layout.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm/BlockJacobiComm.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm/SweepComm.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/SubTVec.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Subdomain.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Subdomain.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Sweep_Solver.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Test/TestKernels.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Test/TestKernels.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Timing.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Timing.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/KripkeRAJA.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/NOTICE.md
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-RAJA/README.md
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/CMakeLists.txt
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/CMakeLists.txt
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Directions.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Directions.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Grid.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Grid.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Input_Variables.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Input_Variables.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DGZ.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DGZ.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DZG.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DZG.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GDZ.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GDZ.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GZD.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GZD.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZDG.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZDG.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZGD.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZGD.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Layout.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Layout.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm/BlockJacobiComm.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm/SweepComm.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/SubTVec.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Subdomain.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Subdomain.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Sweep_Solver.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Test/TestKernels.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Test/TestKernels.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Timing.cpp
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Timing.h
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/NOTICE.md
 delete mode 100644 test/Kripke-v1.1/Kripke-v1.1-baseline/README.md
 delete mode 100644 test/LULESH-v1.0/CMakeLists.txt
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/CMakeLists.txt
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshMemory.hxx
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshPolicy.hxx
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-parallel-FT.cxx
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-parallel.cxx
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-serial.cxx
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/sigcatch.cmd
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_baseline/CMakeLists.txt
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_baseline/lulesh.cc
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_baseline/luleshOMP.cc
 delete mode 100644 test/LULESH-v1.0/LULESH-v1.0_baseline/luleshOMP_NG.cc
 delete mode 100755 test/LULESH-v1.0/LULESH-v1.0_baseline/runme
 delete mode 100644 test/LULESH-v1.0/README
 delete mode 100644 test/LULESH-v2.0/CMakeLists.txt
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/CMakeLists.txt
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/CMakeLists.txt
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/README
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-comm.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-init.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-util.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-viz.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/luleshMemory.hxx
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/luleshPolicy.hxx
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_ptr.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_stl.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_tuple.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/CMakeLists.txt
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/Makefile.keep
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/Makefile.ref
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/README
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-comm.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-init.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-util.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-viz.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_ptr.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_raw.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_stl.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_tuple.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/subs
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/CMakeLists.txt
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/RAJAspecial.hxx
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/README
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-comm.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-init.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-util.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-viz.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.cc.src-KEEP_FULLCONVERT
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh_tuple.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_baseline/CMakeLists.txt
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_baseline/README
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-comm.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-init.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-util.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-viz.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh.cc
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh_ptr.h
 delete mode 100644 test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh_tuple.h
 delete mode 100644 test/LULESH-v2.0/README

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 3ad23e743..690b174b7 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -43,8 +43,4 @@
 
 include_directories(include)
 
-#add_subdirectory(CoMD)
-add_subdirectory(LULESH-v1.0)
-add_subdirectory(LULESH-v2.0)
-add_subdirectory(Kripke-v1.1)
 add_subdirectory(unit-tests)
diff --git a/test/Kripke-v1.1/CMakeLists.txt b/test/Kripke-v1.1/CMakeLists.txt
deleted file mode 100644
index 51be318f1..000000000
--- a/test/Kripke-v1.1/CMakeLists.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-add_subdirectory(Kripke-v1.1-baseline)
-
-if(RAJA_ENABLE_OPENMP)
-  add_subdirectory(Kripke-v1.1-RAJA)
-endif()
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/CMakeLists.txt b/test/Kripke-v1.1/Kripke-v1.1-RAJA/CMakeLists.txt
deleted file mode 100644
index cd1034f74..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/CMakeLists.txt
+++ /dev/null
@@ -1,59 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-include_directories(.)
-
-add_subdirectory(Kripke)
-
-set(KRIPKE_LIBS ${KRIPKE_LIBS} RAJA) 
-
-add_definitions(-DRAJA_ENABLE_NESTED)
-
-if(RAJA_ENABLE_CUDA)
-  cuda_add_executable(Kripke-v1.1-RAJA.exe "Kripke.cpp")
-else()
-  add_executable(Kripke-v1.1-RAJA.exe "Kripke.cpp")
-endif()
-
-target_link_libraries(Kripke-v1.1-RAJA.exe ${KRIPKE_LIBS} ${KRIPKE_LIBS} ${RT_LIBRARIES})
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke.cpp
deleted file mode 100644
index 280b5c254..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke.cpp
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke.h>
-#include<Kripke/Input_Variables.h>
-#include<Kripke/Grid.h>
-#include<Kripke/Test/TestKernels.h>
-#include<stdio.h>
-#include<string.h>
-#include<algorithm>
-#include<string>
-#include<sstream>
-
-#ifdef KRIPKE_USE_MPI
-#include<mpi.h>
-#endif
-
-#ifdef KRIPKE_USE_OPENMP
-#include<omp.h>
-#endif
-
-#ifdef KRIPKE_USE_TCMALLOC
-#include<gperftools/malloc_extension.h>
-#endif
-
-#ifdef KRIPKE_USE_PERFTOOLS
-#include<gperftools/profiler.h>
-#endif
-
-#ifdef __bgq__
-#include </bgsys/drivers/ppcfloor/spi/include/kernel/location.h>
-#include </bgsys/drivers/ppcfloor/spi/include/kernel/memory.h>
-#endif
-
-
-void usage(void){
-  int myid=0;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-#endif
-  if(myid == 0){
-    // Get a new object with defaulted values
-    Input_Variables def;
-    
-    // Display command line
-    printf("Usage:  [srun ...] kripke [options...]\n\n");
-    
-    // Display each option
-    printf("Problem Size Options:\n");
-    printf("---------------------\n");
-    
-    printf("  --groups <ngroups>     Number of energy groups\n");
-    printf("                         Default:  --groups %d\n\n", def.num_groups);
-    
-    printf("  --legendre <lorder>    Scattering Legendre Expansion Order (0, 1, ...)\n");
-    printf("                         Default:  --legendre %d\n\n", def.legendre_order);
-    
-    printf("  --quad [<ndirs>|<polar>:<azim>]\n");
-    printf("                         Define the quadrature set to use\n");
-    printf("                         Either a fake S2 with <ndirs> points,\n");
-    printf("                         OR Gauss-Legendre with <polar> by <azim> points\n");
-    printf("                         Default:  --quad %d\n\n", def.num_directions);
-    
-    
-    
-    printf("  --zones <x,y,z>        Number of zones in x,y,z\n");
-    printf("                         Default:  --zones %d,%d,%d\n\n", def.nx, def.ny, def.nz);
-    
-    
-    printf("\n");
-    printf("Physics Parameters:\n");
-    printf("-------------------\n");
-    printf("  --sigt <st0,st1,st2>   Total material cross-sections\n");
-    printf("                         Default:   --sigt %lf,%lf,%lf\n\n", def.sigt[0], def.sigt[1], def.sigt[2]);
-
-    printf("  --sigs <ss0,ss1,ss2>   Scattering material cross-sections\n");
-    printf("                         Default:   --sigs %lf,%lf,%lf\n\n", def.sigs[0], def.sigs[1], def.sigs[2]);
-
-
-    printf("\n");
-    printf("On-Node Options:\n");
-    printf("----------------\n");
-    printf("  --nest <NEST>          Loop nesting order (and data layout)\n");
-    printf("                         Available: DGZ,DZG,GDZ,GZD,ZDG,ZGD\n");
-    printf("                         Default:   --nest %s\n\n", nestingString(def.nesting).c_str());
-    
-    printf("\n");
-    printf("Parallel Decomposition Options:\n");
-    printf("-------------------------------\n");
-    printf("  --layout <lout>        Layout of spatial subdomains over mpi ranks\n");
-    printf("                         0: Blocked: local zone sets are adjacent\n");
-    printf("                         1: Scattered: adjacent zone sets are distributed\n");
-    printf("                         Default: --layout %d\n\n", def.layout_pattern);
-    
-    
-    printf("  --procs <npx,npy,npz>  Number of MPI ranks in each spatial dimension\n");
-    printf("                         Default:  --procs %d,%d,%d\n\n", def.npx, def.npy, def.npz);
-    
-    printf("  --dset <ds>            Number of direction-sets\n");
-    printf("                         Must be a factor of 8, and divide evenly the number\n");
-    printf("                         of quadrature points\n");
-    printf("                         Default:  --dset %d\n\n", def.num_dirsets);
-    
-    printf("  --gset <gs>            Number of energy group-sets\n");
-    printf("                         Must divide evenly the number energy groups\n");
-    printf("                         Default:  --gset %d\n\n", def.num_groupsets);
-    
-    printf("  --zset <zx>,<zy>,<zz>  Number of zone-sets in x,y, and z\n");
-    printf("                         Default:  --zset %d,%d,%d\n\n", def.num_zonesets_dim[0], def.num_zonesets_dim[1], def.num_zonesets_dim[2]);
-    
-    printf("\n");
-    printf("Solver Options:\n");
-    printf("---------------\n");
-    
-    printf("  --niter <NITER>        Number of solver iterations to run\n");
-    printf("                         Default:  --niter %d\n\n", def.niter);
-    
-    printf("  --pmethod <method>     Parallel solver method\n");
-    printf("                         sweep: Full up-wind sweep (wavefront algorithm)\n");
-    printf("                         bj: Block Jacobi\n");
-    printf("                         Default: --pmethod sweep\n\n");
-    
-    printf("\n");
-    printf("Output and Testing Options:\n");
-    printf("---------------------------\n");
-    
-#ifdef KRIPKE_USE_PAPI
-    printf("  --papi <PAPI_X_X,...>  Track PAPI hardware counters for each timer\n\n");
-#endif
-#ifdef KRIPKE_USE_SILO
-    printf("  --silo <BASENAME>      Create SILO output files\n\n");
-#endif
-    printf("  --test                 Run Kernel Test instead of solver\n\n");
-    printf("\n");
-  }
-#ifdef KRIPKE_USE_MPI
-  MPI_Finalize();
-#endif
-  exit(1);
-}
-
-struct CmdLine {
-  CmdLine(int argc, char **argv) :
-    size(argc-1),
-    cur(0),
-    args()
-  {
-    for(int i = 0;i < size;++ i){
-      args.push_back(argv[i+1]);
-    }
-  }
-
-  std::string pop(void){
-    if(atEnd())
-      usage();
-    return args[cur++];
-  }
-
-  bool atEnd(void){
-    return(cur >= size);
-  }
-
-  int size;
-  int cur;
-  std::vector<std::string> args;
-};
-
-std::vector<std::string> split(std::string const &str, char delim){
-  std::vector<std::string> elem;
-  std::stringstream ss(str);
-  std::string e;
-  while(std::getline(ss, e, delim)){
-    elem.push_back(e);
-  }
-  return elem;
-}
-
-
-namespace {
-  template<typename T>
-  std::string toString(T const &val){
-    std::stringstream ss;
-    ss << val;
-    return ss.str();
-  }
-}
-
-int main(int argc, char **argv) {
-  /*
-   * Initialize MPI
-   */
-
-  int myid=0;
-  int num_tasks=1;
-#ifdef KRIPKE_USE_MPI
-  MPI_Init(&argc, &argv);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-  MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);
-#endif
-
-  if (myid == 0) {
-    /* Print out a banner message along with a version number. */
-    printf("\n");
-    printf("----------------------------------------------------------------------\n");
-    printf("------------------------ KRIPKE VERSION 1.1 --------------------------\n");
-    printf("----------------------------------------------------------------------\n");
-    printf("This work was produced at the Lawrence Livermore National Laboratory\n");
-    printf("(LLNL) under contract no. DE-AC-52-07NA27344 (Contract 44) between the\n");
-    printf("U.S. Department of Energy (DOE) and Lawrence Livermore National\n");
-    printf("Security, LLC (LLNS) for the operation of LLNL. The rights of the\n");
-    printf("Federal Government are reserved under Contract 44.\n");
-    printf("\n");
-    printf("Main Contact: Adam J. Kunen <kunen1@llnl.gov>\n");
-    printf("----------------------------------------------------------------------\n");
-   
-   
-    /* Print out some information about how OpenMP threads are being mapped
-     * to CPU cores.
-     */
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel
-    {
-      int tid = omp_get_thread_num();
-#ifdef __bgq__
-      int core = Kernel_ProcessorCoreID();
-#else
-      int core = sched_getcpu();
-#endif
-      printf("Rank: %d Thread %d: Core %d\n", myid, tid, core);
-    }
-#endif
-  }
-
-  /*
-   * Default input parameters
-   */
-  Input_Variables vars;
-  std::vector<std::string> papi_names;
-  bool test = false;
-  
-  /*
-   * Parse command line
-   */
-  CmdLine cmd(argc, argv);
-  while(!cmd.atEnd()){
-    std::string opt = cmd.pop();
-    if(opt == "-h" || opt == "--help"){usage();}
-    else if(opt == "--name"){vars.run_name = cmd.pop();}
-    else if(opt == "--dset"){
-      vars.num_dirsets = std::atoi(cmd.pop().c_str());      
-    }
-    else if(opt == "--gset"){
-      vars.num_groupsets = std::atoi(cmd.pop().c_str());      
-    }
-    else if(opt == "--zset"){
-      std::vector<std::string> nz = split(cmd.pop(), ',');
-      if(nz.size() != 3) usage();
-      vars.num_zonesets_dim[0] = std::atoi(nz[0].c_str());
-      vars.num_zonesets_dim[1] = std::atoi(nz[1].c_str());
-      vars.num_zonesets_dim[2] = std::atoi(nz[2].c_str());      
-    }
-    else if(opt == "--layout"){
-      vars.layout_pattern = std::atoi(cmd.pop().c_str());      
-    }
-    else if(opt == "--zones"){
-      std::vector<std::string> nz = split(cmd.pop(), ',');
-      if(nz.size() != 3) usage();
-      vars.nx = std::atoi(nz[0].c_str());
-      vars.ny = std::atoi(nz[1].c_str());
-      vars.nz = std::atoi(nz[2].c_str());
-    }
-    else if(opt == "--procs"){
-      std::vector<std::string> np = split(cmd.pop(), ',');
-      if(np.size() != 3) usage();
-      vars.npx = std::atoi(np[0].c_str());
-      vars.npy = std::atoi(np[1].c_str());
-      vars.npz = std::atoi(np[2].c_str());
-    }
-    else if(opt == "--pmethod"){
-      std::string method = cmd.pop();
-      if(!strcasecmp(method.c_str(), "sweep")){
-        vars.parallel_method = PMETHOD_SWEEP;
-      }
-      else if(!strcasecmp(method.c_str(), "bj")){
-        vars.parallel_method = PMETHOD_BJ;
-      }
-      else{
-        usage();
-      }
-    }
-    else if(opt == "--groups"){
-      vars.num_groups = std::atoi(cmd.pop().c_str());      
-    }
-    else if(opt == "--quad"){
-      std::vector<std::string> p = split(cmd.pop(), ':');
-      if(p.size() == 1){
-        vars.num_directions = std::atoi(p[0].c_str());
-        vars.quad_num_polar = 0;
-        vars.quad_num_azimuthal = 0;
-      }
-      else if(p.size() == 2){
-        vars.quad_num_polar = std::atoi(p[0].c_str());
-        vars.quad_num_azimuthal = std::atoi(p[1].c_str());
-        vars.num_directions = vars.quad_num_polar * vars.quad_num_azimuthal;
-      }
-      else{
-        usage();
-      }
-    }
-    else if(opt == "--legendre"){
-      vars.legendre_order = std::atoi(cmd.pop().c_str());
-    }
-    else if(opt == "--sigs"){
-      std::vector<std::string> values = split(cmd.pop(), ',');
-      if(values.size()!=3)usage();
-      for(int mat = 0;mat < 3;++ mat){
-        vars.sigs[mat] = std::atof(values[mat].c_str());
-      }
-    }
-    else if(opt == "--sigt"){
-      std::vector<std::string> values = split(cmd.pop(), ',');
-      if(values.size()!=3)usage();
-      for(int mat = 0;mat < 3;++ mat){
-        vars.sigt[mat] = std::atof(values[mat].c_str());
-      }
-    }
-    else if(opt == "--niter"){
-      vars.niter = std::atoi(cmd.pop().c_str());
-    }
-    else if(opt == "--nest"){
-      vars.nesting = nestingFromString(cmd.pop());     
-    }
-#ifdef KRIPKE_USE_SILO
-    else if(opt == "--silo"){
-      vars.silo_basename = cmd.pop();
-    }
-#endif
-    else if(opt == "--test"){
-      test = true;
-    }
-#ifdef KRIPKE_USE_PAPI
-    else if(opt == "--papi"){
-      papi_names = split(cmd.pop(), ',');
-    }
-#endif
-    else{
-      printf("Unknwon options %s\n", opt.c_str());
-      usage();
-    }
-  }
-  
-  // Check that the input arguments are valid
-  if(vars.checkValues()){
-    exit(1);
-  }
-
-  /*
-   * Display Options
-   */
-  if (myid == 0) {
-    printf("Number of MPI tasks:   %d\n", num_tasks);
-#ifdef KRIPKE_USE_OPENMP
-    int num_threads=1;
-#pragma omp parallel
-    {
-      num_threads = omp_get_num_threads();
-      if(omp_get_thread_num() == 0){
-          printf("OpenMP threads/task:   %d\n", num_threads);
-          printf("OpenMP total threads:  %d\n", num_threads*num_tasks);
-        }
-    }
-#endif
-
-#ifdef KRIPKE_USE_PAPI
-    printf("PAPI Counters:         ");
-    if(papi_names.size() > 0){
-      for(int i = 0;i < papi_names.size();++ i){
-        printf("%s ", papi_names[i].c_str());
-      }
-    }
-    else{
-      printf("<none>");
-    }
-    printf("\n");
-#endif
-    printf("Processors:            %d x %d x %d\n", vars.npx, vars.npy, vars.npz);
-    printf("Zones:                 %d x %d x %d\n", vars.nx, vars.ny, vars.nz);
-    printf("Legendre Order:        %d\n", vars.legendre_order);
-    printf("Total X-Sec:           sigt=[%lf, %lf, %lf]\n", vars.sigt[0], vars.sigt[1], vars.sigt[2]);
-    printf("Scattering X-Sec:      sigs=[%lf, %lf, %lf]\n", vars.sigs[0], vars.sigs[1], vars.sigs[2]);
-    printf("Quadrature Set:        ");
-    if(vars.quad_num_polar == 0){
-      printf("Dummy S2 with %d points\n", vars.num_directions);
-    }
-    else {
-      printf("Gauss-Legendre, %d polar, %d azimuthal (%d points)\n", vars.quad_num_polar, vars.quad_num_azimuthal, vars.num_directions);
-    }
-    printf("Parallel method:       ");
-    if(vars.parallel_method == PMETHOD_SWEEP){
-      printf("Sweep\n");
-    }
-    else if(vars.parallel_method == PMETHOD_BJ){
-      printf("Block Jacobi\n");
-    }
-    printf("Loop Nesting Order     %s\n", nestingString(vars.nesting).c_str());        
-    printf("Number iterations:     %d\n", vars.niter);
-    
-    printf("GroupSet/Groups:       %d sets, %d groups/set\n", vars.num_groupsets, vars.num_groups/vars.num_groupsets);
-    printf("DirSets/Directions:    %d sets, %d directions/set\n", vars.num_dirsets, vars.num_directions/vars.num_dirsets);
-
-    printf("Zone Sets:             %d,%d,%d\n", vars.num_zonesets_dim[0], vars.num_zonesets_dim[1], vars.num_zonesets_dim[2]);
-
-    
-  }
-
-#ifdef KRIPKE_USE_PERFTOOLS
-  ProfilerStart("kripke.prof");
-#endif  
-
-  if(test){
-    // Invoke Kernel testing
-    testKernels(vars);
-  }
-  else{
-    // Allocate problem 
-    Grid_Data *grid_data = new Grid_Data(&vars);
-
-    grid_data->timing.setPapiEvents(papi_names);
-
-    // Run the solver
-    SweepSolver(grid_data, vars.parallel_method == PMETHOD_BJ);
-
-#ifdef KRIPKE_USE_SILO
-    // Output silo data
-    if(vars.silo_basename != ""){
-      grid_data->writeSilo(vars.silo_basename);
-    }
-#endif
-
-    // Print Timing Info
-    int myid=0;
-#ifdef KRIPKE_USE_MPI
-    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-#endif
-    if(myid == 0){
-      grid_data->timing.print();
-      printf("\n\n");
-    }
-
-    // Cleanup 
-    delete grid_data;
-  }
-  
-#ifdef KRIPKE_USE_PERFTOOLS
-  ProfilerStop();
-#endif  
-
-  // Gather post-point memory info
-  double heap_mb = -1.0;
-  double hwm_mb = -1.0;
-#ifdef KRIPKE_USE_TCMALLOC
-  // If we are using tcmalloc, we need to use it's interface
-  MallocExtension *mext = MallocExtension::instance();
-  size_t bytes;
-
-  mext->GetNumericProperty("generic.current_allocated_bytes", &bytes);
-  heap_mb = ((double)bytes)/1024.0/1024.0;
-
-  mext->GetNumericProperty("generic.heap_size", &bytes);
-  hwm_mb = ((double)bytes)/1024.0/1024.0;
-#else
-#ifdef __bgq__
-  // use BG/Q specific calls (if NOT using tcmalloc)
-  uint64_t bytes;
-
-  int rc = Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAP, &bytes);
-  heap_mb = ((double)bytes)/1024.0/1024.0;
-
-  rc = Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAPMAX, &bytes);
-  hwm_mb = ((double)bytes)/1024.0/1024.0;
-#endif
-#endif
-  // Print memory info
-  if(myid == 0 && heap_mb >= 0.0){
-    printf("Bytes allocated: %lf MB\n", heap_mb);
-    printf("Heap Size      : %lf MB\n", hwm_mb);
-
-  }
-  
-  // Cleanup and exit
-#ifdef KRIPKE_USE_MPI
-  MPI_Finalize();
-#endif
-
-  return (0);
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke.h
deleted file mode 100644
index 595e009e3..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_H__
-#define KRIPKE_H__
-
-#include<string>
-#include<vector>
-#include<stdio.h>
-#include<cmath>
-#include<strings.h>
-
-
-// Stubs for building without MPI
-#ifdef KRIPKE_USE_MPI
-
-#include<mpi.h>
-
-#define KripkeAbort(...) {printf(__VA_ARGS__); MPI_Abort(MPI_COMM_WORLD, 1);}
-
-#else
-
-
-#define KripkeAbort(...) {printf(__VA_ARGS__); exit(1);}
-
-#endif
-
-
-// Adopt RAJA's use of OPENMP
-#include<RAJA/RAJA.hxx>
-#ifndef KRIPKE_USE_OPENMP
-#ifdef RAJA_ENABLE_OPENMP
-#define KRIPKE_USE_OPENMP
-#endif
-#endif
-
-// Make sure that there's openmp support, otherwise error out
-#ifdef KRIPKE_USE_OPENMP
-#ifndef _OPENMP
-#error "OpenMP selected for build, but OpenMP is not available"
-#endif
-#endif
-
-// Forward Decl
-struct Grid_Data;
-
-#define KRESTRICT __restrict__
-
-
-// In Kripke/Sweep_Solver.cpp
-int SweepSolver(Grid_Data *grid_data, bool block_jacobi);
-void SweepSubdomains (std::vector<int> subdomain_list, Grid_Data *grid_data, bool block_jacobi);
-
-/**
- * Tags for choosing which data nesting to be chosen
- */
-enum Nesting_Order {
-  // Nestings for Psi and Phi
-  // D referes to directions OR moments, depending on context
-  NEST_DGZ,
-  NEST_DZG,
-  NEST_GDZ,
-  NEST_GZD,
-  NEST_ZDG,
-  NEST_ZGD
-};
-
-/** 
- *Tags corresponding Nesting_Order's, as types
- */
-struct NEST_DGZ_T {};
-struct NEST_DZG_T {};
-struct NEST_GDZ_T {};
-struct NEST_GZD_T {};
-struct NEST_ZDG_T {};
-struct NEST_ZGD_T {};
-
-
-
-
-/**
-  Tags for which parallel algorithm to use.
-*/
-enum ParallelMethod {
-  PMETHOD_SWEEP,
-  PMETHOD_BJ
-};
-
-/**
- * Converts a nesting tag to a human-readable string.
- */
-inline std::string nestingString(Nesting_Order nesting){
-  switch(nesting){
-    case NEST_DGZ: return("DGZ");
-    case NEST_DZG: return("DZG");
-    case NEST_GDZ: return("GDZ");
-    case NEST_GZD: return("GZD");
-    case NEST_ZDG: return("ZDG");
-    case NEST_ZGD: return("ZGD");
-  }
-  return("UNKNOWN");
-}
-
-/**
- * Converts a string (eg. from command line) to a nesting tag.
- */
-inline Nesting_Order nestingFromString(std::string const &str){
-  for(int i = 0;i < 6;++ i){
-    if(!strcasecmp(str.c_str(), nestingString((Nesting_Order)i).c_str())){
-      return (Nesting_Order)i;
-  }
- }
-  return (Nesting_Order)-1;
-}
-
-
-/**
- * Compares two vectors for differences.
- * Used in testing suite.
- */
-inline bool compareVector(std::string const &name,
-    std::vector<double> const &a,
-    std::vector<double> const &b, double tol, bool verbose){
-
-  if(a.size() != b.size()){
-    if(verbose){
-      printf("Vectors are different lengths: %ld, %ld\n",
-          (long)a.size(), (long)b.size());
-    }
-    return true;
-  }
-
-  bool is_diff = false;
-  for(size_t i = 0;i < a.size();++i){
-    if(std::abs(a[i]-b[i]) > tol){
-      is_diff = true;
-      if(verbose){
-        printf("%s[%d]:%e, %e [%e]\n",
-            name.c_str(), (int)i,
-            a[i], b[i], std::abs(a[i]-b[i]));
-        is_diff = true;
-      }
-      else{
-        break;
-      }
-    }
-  }
-
-  return is_diff;
-}
-
-/**
- * Compares two scalars for differences.
- * Used in testing suite.
- */
-inline bool compareScalar(std::string const &name,
-    double a, double b, double tol, bool verbose){
-
-  if(std::abs(a-b) > tol){
-    if(verbose){
-      printf("%s:%e, %e [%e]\n",
-          name.c_str(),
-          a, b, std::abs(a-b));
-    }
-    return true;
-  }
-  return false;
-}
-
-
-#include<KripkeRAJA.h>
-
-#endif
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/CMakeLists.txt b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/CMakeLists.txt
deleted file mode 100644
index cc2504298..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/CMakeLists.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-include_directories(..)
-
-add_definitions(-DRAJA_ENABLE_NESTED)
-
-set(KRIPKE_SRC 
-    Directions.cpp  
-    Grid.cpp  
-    Input_Variables.cpp
-    Kernel.cpp
-    Layout.cpp
-    Subdomain.cpp  
-    Sweep_Solver.cpp
-    ParallelComm.cpp  
-    Timing.cpp
-      
-    Kernel/Kernel_3d_GDZ.cpp
-    Kernel/Kernel_3d_DGZ.cpp
-    Kernel/Kernel_3d_ZDG.cpp
-    Kernel/Kernel_3d_DZG.cpp
-    Kernel/Kernel_3d_ZGD.cpp
-    Kernel/Kernel_3d_GZD.cpp
-    
-    ParallelComm/BlockJacobiComm.cpp
-    ParallelComm/SweepComm.cpp 
-    
-    Test/TestKernels.cpp)
-
-if(RAJA_ENABLE_CUDA)
-  cuda_add_library(lib_kripke_raja ${KRIPKE_SRC})  
-else()
-  add_library(lib_kripke_raja ${KRIPKE_SRC})  
-endif()  
-                                                                                                          
-list(APPEND KRIPKE_LIBS lib_kripke_raja) 
-
-
-# Propagate to parent directory
-set(KRIPKE_LIBS ${KRIPKE_LIBS} PARENT_SCOPE) 
- 
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/DView.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/DView.h
deleted file mode 100644
index d419e0f17..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/DView.h
+++ /dev/null
@@ -1,159 +0,0 @@
-#ifndef __DOMAIN_TVIEW_H__
-#define __DOMAIN_TVIEW_H__
-
-#include<string>
-#include<Kripke.h>
-
-
-template<typename IdxLin, typename Perm, typename ... Idxs>
-struct DLayout : public RAJA::Layout<IdxLin, Perm, Idxs...>{
-
-
-  
-  inline DLayout(Grid_Data &domain, int sdom_id) :
-    RAJA::Layout<IdxLin, Perm, Idxs...>(domain.indexSize<Idxs>(sdom_id)...)
-  {
-  }
-
-  template<typename ... ARGS>
-  RAJA_HOST_DEVICE
-  inline DLayout(ARGS ... args) :
-    RAJA::Layout<IdxLin, Perm, Idxs...>(args...)
-  {}
-
-};
-
-
-template<typename DataType, typename L>
-struct DView {};
-
-template<typename DataType, typename IdxLin, typename Perm, typename ... Idxs>
-struct DView<DataType, DLayout<IdxLin, Perm, Idxs...>> : public RAJA::View<DataType, DLayout<IdxLin, Perm, Idxs...>> {
-
-  inline DView(Grid_Data &domain, int sdom_id, DataType *ptr) :
-    RAJA::View<DataType, DLayout<int, Perm, Idxs...>>(
-        ptr,
-        domain.indexSize<Idxs>(sdom_id)...)
-  {}
-};
-
-#if 0
-
-template<typename POL, typename IdxI, typename R, typename BODY>
-RAJA_INLINE
-void dForallN_expanded(Grid_Data &domain, int sdom_id, BODY const &body, R (BODY::*mf)(IdxI) const){
-
-  RAJA::RangeSegment seg_i = domain.indexRange<IdxI>(sdom_id);
-
-  // Call underlying forall, extracting ranges from domain
-  RAJA::forallN<POL, IdxI>(seg_i, body);
-}
-
-template<typename POL, typename IdxI, typename IdxJ, typename R, typename BODY>
-RAJA_INLINE
-void dForallN_expanded(Grid_Data &domain, int sdom_id, BODY const &body, R (BODY::*mf)(IdxI, IdxJ) const){
-
-  RAJA::RangeSegment seg_i = domain.indexRange<IdxI>(sdom_id);
-  RAJA::RangeSegment seg_j = domain.indexRange<IdxJ>(sdom_id);
-
-  // Call underlying forall, extracting ranges from domain
-  RAJA::forallN<POL, IdxI, IdxJ>(seg_i, seg_j, body);
-}
-
-
-
-template<typename POL, typename IdxI, typename IdxJ, typename IdxK, typename R, typename BODY>
-RAJA_INLINE
-void dForallN_expanded(Grid_Data &domain, int sdom_id, BODY const &body, R (BODY::*mf)(IdxI, IdxJ, IdxK) const){
-
-  RAJA::RangeSegment seg_i = domain.indexRange<IdxI>(sdom_id);
-  RAJA::RangeSegment seg_j = domain.indexRange<IdxJ>(sdom_id);
-  RAJA::RangeSegment seg_k = domain.indexRange<IdxK>(sdom_id);
-
-  // Call underlying forall, extracting ranges from domain
-  RAJA::forallN<POL, IdxI, IdxJ, IdxK>(seg_i, seg_j, seg_k, body);
-}
-
-
-template<typename POL, typename IdxI, typename IdxJ, typename IdxK, typename IdxL, typename R, typename BODY>
-RAJA_INLINE
-void dForallN_expanded(Grid_Data &domain, int sdom_id, BODY const &body, R (BODY::*mf)(IdxI, IdxJ, IdxK, IdxL) const){
-
-  RAJA::RangeSegment seg_i = domain.indexRange<IdxI>(sdom_id);
-  RAJA::RangeSegment seg_j = domain.indexRange<IdxJ>(sdom_id);
-  RAJA::RangeSegment seg_k = domain.indexRange<IdxK>(sdom_id);
-  RAJA::RangeSegment seg_l = domain.indexRange<IdxL>(sdom_id);
-
-  // Call underlying forall, extracting ranges from domain
-  RAJA::forallN<POL, IdxI, IdxJ, IdxK, IdxL>(seg_i, seg_j, seg_k, seg_l, body);
-}
-
-
-template<typename POLICY, typename BODY>
-RAJA_INLINE 
-void dForallN(Grid_Data &domain, int sdom_id, BODY body){
-  dForallN_expanded<POLICY>(domain, sdom_id, body, &BODY::operator());
-}
-
-#else
-
-
-#endif
-
-
-
-template<typename POL, typename IdxI, typename BODY>
-RAJA_INLINE
-void dForallN(Grid_Data &domain, int sdom_id, BODY const &body){
-
-  RAJA::RangeSegment seg_i = domain.indexRange<IdxI>(sdom_id);
-
-  // Call underlying forall, extracting ranges from domain
-  RAJA::forallN<POL, IdxI>(seg_i, body);
-}
-
-template<typename POL, typename IdxI, typename IdxJ, typename BODY>
-RAJA_INLINE
-void dForallN(Grid_Data &domain, int sdom_id, BODY const &body){
-
-  RAJA::RangeSegment seg_i = domain.indexRange<IdxI>(sdom_id);
-  RAJA::RangeSegment seg_j = domain.indexRange<IdxJ>(sdom_id);
-
-  // Call underlying forall, extracting ranges from domain
-  RAJA::forallN<POL, IdxI, IdxJ>(seg_i, seg_j, body);
-}
-
-
-
-template<typename POL, typename IdxI, typename IdxJ, typename IdxK, typename BODY>
-RAJA_INLINE
-void dForallN(Grid_Data &domain, int sdom_id, BODY const &body){
-
-  RAJA::RangeSegment seg_i = domain.indexRange<IdxI>(sdom_id);
-  RAJA::RangeSegment seg_j = domain.indexRange<IdxJ>(sdom_id);
-  RAJA::RangeSegment seg_k = domain.indexRange<IdxK>(sdom_id);
-
-  // Call underlying forall, extracting ranges from domain
-  RAJA::forallN<POL, IdxI, IdxJ, IdxK>(seg_i, seg_j, seg_k, body);
-}
-
-
-template<typename POL, typename IdxI, typename IdxJ, typename IdxK, typename IdxL, typename BODY>
-RAJA_INLINE
-void dForallN(Grid_Data &domain, int sdom_id, BODY const &body){
-
-  RAJA::RangeSegment seg_i = domain.indexRange<IdxI>(sdom_id);
-  RAJA::RangeSegment seg_j = domain.indexRange<IdxJ>(sdom_id);
-  RAJA::RangeSegment seg_k = domain.indexRange<IdxK>(sdom_id);
-  RAJA::RangeSegment seg_l = domain.indexRange<IdxL>(sdom_id);
-
-  // Call underlying forall, extracting ranges from domain
-  RAJA::forallN<POL, IdxI, IdxJ, IdxK, IdxL>(seg_i, seg_j, seg_k, seg_l, body);
-}
-
-
-
-#endif
-
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Directions.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Directions.cpp
deleted file mode 100644
index a68e1a3ea..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Directions.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke.h>
-#include <Kripke/Directions.h>
-#include <Kripke/Grid.h>
-#include <Kripke/Input_Variables.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <float.h>
-#include <algorithm>
-
-namespace {
-  /*
-    GaussLegendre returns the n point Gauss-Legendre quadrature rule for
-    the integral between x1 and x2.
-  */
-  void GaussLegendre(double x1, double x2, std::vector<double> &x,
-      std::vector<double> &w, double eps)
-  {
-    int n = x.size();
-    int m, j, i;
-    double z1, z, xm, xl, pp, p3, p2, p1;
-
-    m=(n+1)/2;
-    xm=0.5*(x2+x1);
-    xl=0.5*(x2-x1);
-    for(i=1; i<=m; i++){
-      z=cos(M_PI*(i-0.25)/(n+0.5));
-      do {
-        p1=1.0;
-        p2=0.0;
-        for(j=1; j<=n; j++){
-          p3=p2;
-          p2=p1;
-          p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
-        }
-        pp=n*(z*p1-p2)/(z*z-1.0);
-        z1=z;
-        z=z1-p1/pp;
-      } while(fabs(z-z1) > eps);
-      x[i-1]=xm-xl*z;
-      x[n-i]=xm+xl*z;
-      w[i-1]=2.0*xl/((1.0-z*z)*pp*pp);
-
-      w[n-i]=w[i-1];
-    }
-  }
-
-
-  bool dirSortFcn(Directions const &a, Directions const &b){
-    return b.octant < a.octant;
-  }
-}
-
-/**
- * Initializes the quadrature set information for a Grid_Data object.
- * This guarantees that each <GS,DS> pair have a single originating octant.
- */
-void InitDirections(Grid_Data *grid_data, Input_Variables *input_vars)
-{
-  std::vector<Directions> &directions = grid_data->directions;
-
-  // Get set description from user
-  int num_directions_per_octant = input_vars->num_directions/8;
-  int num_directions = input_vars->num_directions;
-
-  // allocate storage
-  directions.resize(num_directions);
-
-  // Are we running a REAL quadrature set?
-  int num_polar = input_vars->quad_num_polar;
-  int num_azimuth = input_vars->quad_num_azimuthal;
-
-  std::vector<double> polar_cos;
-  std::vector<double> polar_weight;
-  if(num_polar > 0){
-    // make sure the user specified the correct number of quadrature points
-    if(num_polar % 4 != 0){
-      KripkeAbort("Must have number of polar angles be a multiple of 4\n");
-    }
-    if(num_azimuth % 2 != 0){
-      KripkeAbort("Must have number of azimuthal angles be a multiple of 2\n");
-    }
-    if(num_polar*num_azimuth != num_directions){
-      KripkeAbort("You need to specify %d total directions, not %d\n",
-          num_polar*num_azimuth, num_directions);
-    }
-
-    // Compute gauss legendre weights
-    polar_cos.resize(num_polar);
-    polar_weight.resize(num_polar);
-    GaussLegendre(-1.0, 1.0, polar_cos, polar_weight, DBL_EPSILON);
-
-    // compute azmuhtal angles and weights
-    std::vector<double> az_angle(num_azimuth);
-    std::vector<double> az_weight(num_azimuth);
-    double dangle = 2.0*M_PI/((double) num_azimuth);
-
-    for(int i=0; i<num_azimuth; i++){
-      if(i == 0){
-        az_angle[0] = dangle/2.0;
-      }
-      else{
-        az_angle[i] = az_angle[i-1] + dangle;
-      }
-      az_weight[i] = dangle;
-    }
-
-
-    // Loop over polar 'octants
-    int d = 0;
-    for(int i=0; i< num_polar; i++){
-      for(int j=0; j< num_azimuth; j++){
-        double xcos = sqrt(1.0-polar_cos[i]*polar_cos[i]) * cos(az_angle[j]);
-        double ycos = sqrt(1.0-polar_cos[i]*polar_cos[i]) * sin(az_angle[j]);
-        double zcos = polar_cos[i];
-        double w = polar_weight[i]*az_weight[j];
-
-        directions[d].id = (xcos > 0.) ? 1 : -1;
-        directions[d].jd = (ycos > 0.) ? 1 : -1;
-        directions[d].kd = (zcos > 0.) ? 1 : -1;
-
-        directions[d].octant = 0;
-        if(directions[d].id == -1){
-          directions[d].octant += 1;
-        }
-        if(directions[d].jd == -1){
-          directions[d].octant += 2;
-        }
-        if(directions[d].kd == -1){
-          directions[d].octant += 4;
-        }
-
-        directions[d].xcos = std::abs(xcos);
-        directions[d].ycos = std::abs(ycos);
-        directions[d].zcos = std::abs(zcos);
-        directions[d].w = w;
-
-        ++ d;
-      }
-    }
-
-    // Sort by octant.. so each set has same directions
-    std::sort(directions.begin(), directions.end(), dirSortFcn);
-  }
-  else{
-    // Do (essentialy) an S2 quadrature.. but with repeated directions
-
-    // Compute x,y,z cosine values
-    double mu  = cos(M_PI/4);
-    double eta = sqrt(1-mu*mu) * cos(M_PI/4);
-    double xi  = sqrt(1-mu*mu) * sin(M_PI/4);
-    int d = 0;
-    for(int octant = 0;octant < 8;++ octant){
-      double omegas[3];
-      omegas[0] = octant & 0x1;
-      omegas[1] = (octant>>1) & 0x1;
-      omegas[2] = (octant>>2) & 0x1;
-
-      for(int sd=0; sd<num_directions_per_octant; sd++, d++){
-        // Store which logical direction of travel we have
-        directions[d].id = (omegas[0] > 0.) ? 1 : -1;
-        directions[d].jd = (omegas[1] > 0.) ? 1 : -1;
-        directions[d].kd = (omegas[2] > 0.) ? 1 : -1;
-
-        // Store quadrature point's weight
-        directions[d].w = 4.0*M_PI / (double)num_directions;
-        directions[d].xcos = mu;
-        directions[d].ycos = eta;
-        directions[d].zcos = xi;
-      }
-    }
-  }
-}
-
-
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Directions.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Directions.h
deleted file mode 100644
index b0e228ad9..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Directions.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_DIRECTIONS_H__
-#define KRIPKE_DIRECTIONS_H__
-
-#include <vector>
-
-struct Grid_Data;
-struct Input_Variables;
-
-/**
- * Contains information needed for one quadrature set direction.
- */
-struct Directions{
-  double xcos;              /* Absolute value of the x-direction cosine. */
-  double ycos;              /* Absolute value of the y-direction cosine. */
-  double zcos;              /* Absolute value of the z-direction cosine. */
-  double w;                 /* weight for the quadrature rule.*/
-  int id;                   /* direction flag (= 1 if x-direction
-                            cosine is positive; = -1 if not). */
-  int jd;                   /* direction flag (= 1 if y-direction
-                            cosine is positive; = -1 if not). */
-  int kd;                   /* direction flag (= 1 if z-direction
-                            cosine is positive; = -1 if not). */
-  int octant;
-};
-
-
-void InitDirections(Grid_Data *grid_data, Input_Variables *input_vars);
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Grid.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Grid.cpp
deleted file mode 100644
index f336930c3..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Grid.cpp
+++ /dev/null
@@ -1,561 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke.h>
-#include <Kripke/Grid.h>
-
-#include <Kripke/Input_Variables.h>
-#include <Kripke/Layout.h>
-#include <Kripke/SubTVec.h>
-#include <cmath>
-#include <sstream>
-
-#ifdef KRIPKE_USE_MPI
-#include <mpi.h>
-#endif
-
-#ifdef KRIPKE_USE_SILO
-#include <sys/stat.h>
-#include <silo.h>
-#include <string.h>
-#endif
-
-/**
- * Grid_Data constructor
-*/
-Grid_Data::Grid_Data(Input_Variables *input_vars)
-{
-  // Create object to describe processor and subdomain layout in space
-  // and their adjacencies
-  Layout *layout = createLayout(input_vars);
-
-  // create the kernel object based on nesting
-  kernel = createKernel(input_vars->nesting, 3);
-
-  // Create quadrature set (for all directions)
-  int total_num_directions = input_vars->num_directions;
-  InitDirections(this, input_vars);
-
-  num_direction_sets = input_vars->num_dirsets;
-  num_directions_per_set = total_num_directions/num_direction_sets;
-  num_group_sets = input_vars->num_groupsets;
-  num_groups_per_set = input_vars->num_groups/ num_group_sets;
-  num_zone_sets = 1;
-  for(int dim = 0;dim < 3;++ dim){
-    num_zone_sets *= input_vars->num_zonesets_dim[dim];
-  }
-
-  legendre_order = input_vars->legendre_order;
-  total_num_moments = (legendre_order+1)*(legendre_order+1);
-
-  int num_subdomains = num_direction_sets*num_group_sets*num_zone_sets;
-
-  Nesting_Order nest = input_vars->nesting;
-
-  /* Set ncalls */
-  niter = input_vars->niter;
-
-  // setup mapping of moments to legendre coefficients
-  moment_to_coeff.resize(total_num_moments);
-  int nm = 0;
-  for(int n = 0;n < legendre_order+1;++ n){
-    for(int m = -n;m <= n; ++ m){
-      moment_to_coeff[nm] = n;
-      ++ nm;
-    }
-  }
-
-  // setup cross-sections
-  int total_num_groups = num_group_sets*num_groups_per_set;
-  sigma_tot.resize(total_num_groups, 0.0);
-
-  // Setup scattering transfer matrix for 3 materials  
-
-  sigs = new SubTVec(kernel->nestingSigs(), total_num_groups*total_num_groups, legendre_order+1, 3);
-
-  // Set to isotropic scattering given user inputs
-  sigs->clear(0.0);
-  for(int mat = 0;mat < 3;++ mat){
-    for(int g = 0;g < total_num_groups;++ g){
-      int idx_g_gp = g*total_num_groups + g;
-      (*sigs)(idx_g_gp, 0, mat) = input_vars->sigs[mat];
-    }
-  }
-
-  // just allocate pointer vectors, we will allocate them below
-  ell.resize(num_direction_sets, NULL);
-  ell_plus.resize(num_direction_sets, NULL);
-  phi.resize(num_zone_sets, NULL);
-  phi_out.resize(num_zone_sets, NULL);
-
-  // Initialize Subdomains
-  zs_to_sdomid.resize(num_zone_sets);
-  subdomains.resize(num_subdomains);
-  for(int gs = 0;gs < num_group_sets;++ gs){
-    for(int ds = 0;ds < num_direction_sets;++ ds){
-      for(int zs = 0;zs < num_zone_sets;++ zs){
-        // Compupte subdomain id
-        int sdom_id = layout->setIdToSubdomainId(gs, ds, zs);
-
-        // Setup the subdomain
-        Subdomain &sdom = subdomains[sdom_id];
-        sdom.setup(sdom_id, input_vars, gs, ds, zs, directions, kernel, layout);
-
-        // Create ell and ell_plus, if this is the first of this ds
-        bool compute_ell = false;
-        if(ell[ds] == NULL){
-          ell[ds] = new SubTVec(kernel->nestingEll(), total_num_moments, sdom.num_directions, 1);
-          ell_plus[ds] = new SubTVec(kernel->nestingEllPlus(), total_num_moments, sdom.num_directions, 1);
-
-          compute_ell = true;
-        }
-
-        // Create phi and phi_out, if this is the first of this zs
-        if(phi[zs] == NULL){
-          phi[zs] = new SubTVec(nest, total_num_groups, total_num_moments, sdom.num_zones);
-          phi_out[zs] = new SubTVec(nest, total_num_groups, total_num_moments, sdom.num_zones);
-        }
-
-        // setup zs to sdom mapping
-        if(gs == 0 && ds == 0){
-          zs_to_sdomid[zs] = sdom_id;
-        }
-
-        // Set the variables for this subdomain
-        sdom.setVars(ell[ds], ell_plus[ds], phi[zs], phi_out[zs]);
-
-        if(compute_ell){
-          // Compute the L and L+ matrices
-          sdom.computeLLPlus(legendre_order);
-        }
-      }
-    }
-  }
-  delete layout;
-
-
-
-  // Now compute number of elements allocated globally,
-  // and get each materials volume
-  long long vec_size[4] = {0,0,0,0};
-  double vec_volume[3] = {0.0, 0.0, 0.0};
-  for(int sdom_id = 0;sdom_id < subdomains.size();++sdom_id){
-    Subdomain &sdom = subdomains[sdom_id];
-    vec_size[0] += sdom.psi->elements;
-    vec_size[1] += sdom.psi->elements;
-  }
-  for(int zs = 0;zs < num_zone_sets;++ zs){
-    vec_size[2] += phi[zs]->elements;
-    vec_size[3] += phi_out[zs]->elements;
-    int sdom_id = zs_to_sdomid[zs];
-    for(int mat = 0;mat < 3;++ mat){
-      vec_volume[mat] += subdomains[sdom_id].reg_volume[mat];
-    }
-  }
-
-
-#ifdef KRIPKE_USE_MPI
-  int mpi_rank;
-  double global_volume[3];
-  long long global_size[4];
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-  MPI_Reduce(vec_size, global_size, 4, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
-  MPI_Reduce(vec_volume, global_volume, 3, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-#else
-  int mpi_rank = 0;
-  long long *global_size = vec_size;
-  double *global_volume = vec_volume;
-#endif
-
-  if(mpi_rank == 0){
-    printf("Unknown counts: psi=%ld, rhs=%ld, phi=%ld, phi_out=%ld\n",
-      (long)global_size[0], (long)global_size[1], (long)global_size[2], (long)global_size[3]);
-    printf("Region volumes: Reg1=%e, Reg2=%e, Reg3=%e\n",
-        global_volume[0], global_volume[1], global_volume[2]);
-  }
-}
-
-Grid_Data::~Grid_Data(){
-  delete kernel;
-  for(int zs = 0;zs < num_zone_sets;++ zs){
-    delete phi[zs];
-    delete phi_out[zs];
-  }
-  for(int ds = 0;ds < num_direction_sets;++ ds){
-    delete ell[ds];
-    delete ell_plus[ds];
-  }
-  delete sigs;
-}
-
-/**
- * Randomizes all variables and matrices for testing suite.
- */
-void Grid_Data::randomizeData(void){
-  for(int i = 0;i < sigma_tot.size();++i){
-    sigma_tot[i] = drand48();
-  }
-
-  for(int i = 0;i < directions.size();++i){
-    directions[i].xcos = drand48();
-    directions[i].ycos = drand48();
-    directions[i].zcos = drand48();
-  }
-
-
-  for(int s = 0;s < subdomains.size();++ s){
-    subdomains[s].randomizeData();
-  }
-
-  for(int zs = 0;zs < num_zone_sets;++ zs){
-    phi[zs]->randomizeData();
-    phi_out[zs]->randomizeData();
-  }
-
-  for(int ds = 0;ds < num_direction_sets;++ ds){
-    ell[ds]->randomizeData();
-    ell_plus[ds]->randomizeData();
-  }
-
-  sigs->randomizeData();
-}
-
-
-/**
- * Returns the integral of psi.. to look at convergence
- */
-double Grid_Data::particleEdit(void){
-  // sum up particles for psi and rhs
-  double part = 0.0;
-  for(int sdom_id = 0;sdom_id < subdomains.size();++ sdom_id){
-    Subdomain &sdom = subdomains[sdom_id];
-
-    int num_zones = sdom.num_zones;
-    int num_directions = sdom.num_directions;
-    int num_groups= sdom.num_groups;
-    Directions *dirs = sdom.directions;
-
-    for(int z = 0;z < num_zones;++ z){
-      double vol = sdom.volume[z];
-      for(int d = 0;d < num_directions;++ d){
-        double w = dirs[d].w;
-        for(int g = 0;g < num_groups;++ g){
-          part += w * (*sdom.psi)(g,d,z) * vol;
-        }
-      }
-    }
-  }
-
-  // reduce
-#ifdef KRIPKE_USE_MPI
-  double part_global;
-
-  MPI_Reduce(&part, &part_global, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-
-  return part_global;
-#else
-
-  return part;
-
-#endif
-}
-
-
-/**
- * Copies all variables and matrices for testing suite.
- * Correctly copies data from one nesting to another.
- */
-void Grid_Data::copy(Grid_Data const &b){
-  sigma_tot = b.sigma_tot;
-  directions = b.directions;
-
-  subdomains.resize(b.subdomains.size());
-  for(int s = 0;s < subdomains.size();++ s){
-    subdomains[s].copy(b.subdomains[s]);
-  }
-
-  for(int zs = 0;zs < num_zone_sets;++ zs){
-    phi[zs]->copy(*b.phi[zs]);
-    phi_out[zs]->copy(*b.phi_out[zs]);
-  }
-
-  for(int ds = 0;ds < ell.size();++ ds){
-    ell[ds]->copy(*b.ell[ds]);
-    ell_plus[ds]->copy(*b.ell_plus[ds]);
-  }
-
-  sigs->copy(*b.sigs);
-}
-
-/**
- * Compares all variables and matrices for testing suite.
- * Correctly compares data from one nesting to another.
- */
-bool Grid_Data::compare(Grid_Data const &b, double tol, bool verbose){
-  bool is_diff = false;
-
-  for(int i = 0;i < directions.size();++i){
-    std::stringstream dirname;
-    dirname << "directions[" << i << "]";
-
-    is_diff |= compareScalar(dirname.str()+".xcos",
-        directions[i].xcos, b.directions[i].xcos, tol, verbose);
-
-    is_diff |= compareScalar(dirname.str()+".ycos",
-        directions[i].ycos, b.directions[i].ycos, tol, verbose);
-
-    is_diff |= compareScalar(dirname.str()+".zcos",
-        directions[i].zcos, b.directions[i].zcos, tol, verbose);
-  }
-
-  for(int s = 0;s < subdomains.size();++ s){
-    is_diff |= subdomains[s].compare(
-        b.subdomains[s], tol, verbose);
-
-  }
-  is_diff |= compareVector("sigma_tot", sigma_tot, b.sigma_tot, tol, verbose);
-
-  for(int zs = 0;zs < num_zone_sets;++ zs){
-    is_diff |= phi[zs]->compare("phi", *b.phi[zs], tol, verbose);
-    is_diff |= phi_out[zs]->compare("phi_out", *b.phi_out[zs], tol, verbose);
-  }
-
-  for(int ds = 0;ds < ell.size();++ ds){
-    is_diff |= ell[ds]->compare("ell", *b.ell[ds], tol, verbose);
-    is_diff |= ell_plus[ds]->compare("ell_plus", *b.ell_plus[ds], tol, verbose);
-  }
-
-  is_diff |= sigs->compare("sigs", *b.sigs, tol, verbose);
-
-  return is_diff;
-}
-
-
-#ifdef KRIPKE_USE_SILO
-
-enum MultivarType {
-  MULTI_MESH,
-  MULTI_MAT,
-  MULTI_VAR
-};
-
-namespace {
-  /**
-    Writes a multimesh or multivar to the root file.
-  */
-
-  void siloWriteMulti(DBfile *root, MultivarType mv_type,
-    std::string const &fname_base, std::string const &var_name,
-    std::vector<int> sdom_id_list, int var_type = 0)
-  {
-    int mpi_size;
-    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-    int num_sdom = sdom_id_list.size();
-
-    // setup names and types
-    std::vector<int> var_types(mpi_size*num_sdom, var_type);
-    std::vector<char *> var_names(mpi_size*num_sdom);
-    int var_idx = 0;
-    for(int rank = 0;rank < mpi_size;++ rank){
-      for(int idx = 0;idx < num_sdom;++ idx){
-        int sdom_id = sdom_id_list[idx];
-        std::stringstream name;
-        name << fname_base << "/rank_" << rank << ".silo:/sdom" << sdom_id << "/" << var_name;
-        var_names[var_idx] = strdup(name.str().c_str());
-        var_idx ++;
-      }
-    }
-
-    if(mv_type == MULTI_MESH){
-      DBPutMultimesh(root, var_name.c_str(), mpi_size*num_sdom,
-          &var_names[0], &var_types[0], NULL);
-    }
-    else if(mv_type == MULTI_MAT){
-      DBPutMultimat(root, var_name.c_str(), mpi_size*num_sdom,
-          &var_names[0],  NULL);
-    }
-    else{
-      DBPutMultivar(root, var_name.c_str(), mpi_size*num_sdom,
-          &var_names[0],  &var_types[0] , NULL);
-    }
-
-    // cleanup
-    for(int i = 0;i < mpi_size*num_sdom; ++i){
-      free(var_names[i]);
-    }
-  }
-
-  void siloWriteRectMesh(DBfile *silo_file,
-    std::string const &mesh_name,
-    int const *nzones,
-    double const *zeros,
-    double const *deltas_x,
-    double const *deltas_y,
-    double const *deltas_z)
-  {
-    static char const *coordnames[3] = {"X", "Y", "Z"};
-    double const *deltas[3] = {deltas_x, deltas_y, deltas_z};
-    double *coords[3];
-    for(int dim = 0;dim < 3;++ dim){
-      coords[dim] = new double[nzones[dim]];
-      coords[dim][0] = zeros[dim];
-      for(int z = 0;z < nzones[dim];++ z){
-        coords[dim][1+z] = coords[dim][z] + deltas[dim][z];
-      }
-    }
-    int nnodes[3] = {
-      nzones[0]+1,
-      nzones[1]+1,
-      nzones[2]+1
-    };
-
-    DBPutQuadmesh(silo_file, mesh_name.c_str(), const_cast<char**>(coordnames), coords, nnodes, 3, DB_DOUBLE,
-        DB_COLLINEAR, NULL);
-
-    // cleanup
-    delete[] coords[0];
-    delete[] coords[1];
-    delete[] coords[2];
-  }
-
-
-} //namespace
-
-
-void Grid_Data::writeSilo(std::string const &fname_base){
-
-  // Recompute Phi... so we can write out phi0
-  kernel->LTimes(this);
-
-  int mpi_rank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-
-  if(mpi_rank == 0){
-    // Create a root file
-    std::string fname_root = fname_base + ".silo";
-    DBfile *root = DBCreate(fname_root.c_str(),
-        DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5);
-
-    // Write out multimesh and multivars
-    siloWriteMulti(root, MULTI_MESH, fname_base, "mesh", zs_to_sdomid, DB_QUAD_RECT);
-    siloWriteMulti(root, MULTI_MAT, fname_base, "material", zs_to_sdomid);
-    siloWriteMulti(root, MULTI_VAR, fname_base, "phi0", zs_to_sdomid, DB_QUADVAR);
-
-    // Close root file
-    DBClose(root);
-
-    // Create a subdirectory to hold processor info
-    mkdir(fname_base.c_str(), 0750);
-  }
-
-  // Sync up, so everyone sees the subdirectory
-  MPI_Barrier(MPI_COMM_WORLD);
-
-  // Create our processor file
-  std::stringstream ss_proc;
-  ss_proc << fname_base << "/rank_" << mpi_rank << ".silo";
-  DBfile *proc = DBCreate(ss_proc.str().c_str(),
-      DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5);
-
-  // Write out data for each subdomain
-  int num_zone_sets = zs_to_sdomid.size();
-  for(int sdom_idx = 0;sdom_idx < num_zone_sets;++ sdom_idx){
-    int sdom_id = zs_to_sdomid[sdom_idx];
-    Subdomain &sdom = subdomains[sdom_id];
-
-    // Create a directory for the subdomain
-    std::stringstream dirname;
-    dirname << "/sdom" << sdom_id;
-    DBMkDir(proc, dirname.str().c_str());
-
-    // Set working directory
-    DBSetDir(proc, dirname.str().c_str());
-
-    // Write the mesh
-    siloWriteRectMesh(proc, "mesh", sdom.nzones, sdom.zeros,
-      &sdom.deltas[0][1], &sdom.deltas[1][1], &sdom.deltas[2][1]);
-
-
-    // Write the material
-    {
-      int num_zones = sdom.num_zones;
-      int num_mixed = sdom.mixed_material.size();
-      int matnos[3] = {1, 2, 3};
-      std::vector<int> matlist(num_zones, 0);
-      std::vector<int> mix_next(num_mixed, 0);
-      std::vector<int> mix_mat(num_mixed, 0);
-
-      // setup matlist and mix_next arrays
-      int last_z = -1;
-      for(int m = 0;m < num_mixed;++ m){
-        mix_mat[m] = sdom.mixed_material[m] + 1;
-        int z = sdom.mixed_to_zones[m];
-        if(matlist[z] == 0){
-            matlist[z] = -(1+m);
-        }
-        // if we are still on the same zone, make sure the last mix points
-        // here
-        if(z == last_z){
-          mix_next[m-1] = m+1;
-        }
-        last_z = z;
-      }
-
-      DBPutMaterial(proc, "material", "mesh", 3, matnos,
-          &matlist[0], sdom.nzones, 3,
-          &mix_next[0], &mix_mat[0], &sdom.mixed_to_zones[0], &sdom.mixed_fraction[0], num_mixed,
-          DB_DOUBLE, NULL);
-    }
-
-    // Write phi0
-    {
-
-      int num_zones = sdom.num_zones;
-      std::vector<double> phi0(num_zones);
-
-      // extract phi0 from phi for the 0th group
-      for(int z = 0;z < num_zones;++ z){
-        phi0[z] = (*sdom.phi)(0,0,z);
-      }
-
-      DBPutQuadvar1(proc, "phi0", "mesh", &phi0[0],
-          sdom.nzones, 3, NULL, 0, DB_DOUBLE, DB_ZONECENT, NULL);
-    }
-  }
-
-  // Close processor file
-  DBClose(proc);
-}
-#endif
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Grid.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Grid.h
deleted file mode 100644
index 542c71b79..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Grid.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_GRID_DATA_H__
-#define KRIPKE_GRID_DATA_H__
-
-#include <Kripke.h>
-#include <Kripke/Directions.h>
-#include <Kripke/Kernel.h>
-#include <Kripke/Subdomain.h>
-#include <Kripke/Timing.h>
-#include <vector>
-
-// Foreward Decl
-struct Input_Variables;
-struct SubTVec;
-
-
-/**
- * Contains all grid parameters and variables.
- */
-struct Grid_Data {
-public:
-  explicit Grid_Data(Input_Variables *input_vars);
-  ~Grid_Data();
-
-  void randomizeData(void);
-  void copy(Grid_Data const &b);
-  bool compare(Grid_Data const &b, double tol, bool verbose);
-  double particleEdit(void);
-#ifdef KRIPKE_USE_SILO
-  void writeSilo(std::string const &fname);
-#endif
-
-  Timing timing;
-
-  int niter;
-
-  double source_value;
-
-  std::vector<double> sigma_tot;            // Cross section data
-
-  int num_group_sets;                       // Number of group-sets
-  int num_groups_per_set;                   // How many groups in each set
-  int num_direction_sets;                   // Number of direction-sets
-  int num_directions_per_set;               // Number of directions per dir set
-  int num_zone_sets;                        // Number of zone sets
-  int legendre_order;                       // Legendra expansion order ( >= 0 )
-  int total_num_moments;                    // Number of spherical harmonic moments
-
-  std::vector<int> moment_to_coeff;         // Map from harmonic moments to legendre coefficients
-
-  std::vector<Directions> directions;       // Quadrature point data, for all directions
-  Kernel *kernel;                           // Layout-specific math kernels
-
-  std::vector<Subdomain> subdomains;        // Group/Angle/Zone set data
-  std::vector<int> zs_to_sdomid;            // map of zonesets to subdomains with ds=gs=0
-
-  // Variables:
-  SubTVec *sigs;                            // scattering lookup table for each material
-                                            // G=g->gp, D=legendre coeff, Z=matidx
-
-  // Per directionset ell and ell_plus matrices (Subdomain point into these arrays)
-  std::vector<SubTVec *> ell;               // L matrix in nm_offset coordinates
-  std::vector<SubTVec *> ell_plus;          // L+ matrix in nm_offset coordinates
-
-  // Per zoneset phi and phi_out (Subdomains point into these arrays)
-  std::vector<SubTVec *> phi;               // Moments of psi
-  std::vector<SubTVec *> phi_out;           // Scattering source
-
-  template<typename T>
-  inline int indexSize(int sdom_id){
-
-    // Get size of index from hash in the Subdomain object
-    //Subdomain &sdom = subdomains[sdom_id];
-    //sdom.index_size[T::getName()];
-    return subdomains[sdom_id].index_size[T::getName()];
-  }
-
-  template<typename T>
-  inline RAJA::RangeSegment indexRange(int sdom_id){
-
-    // Get size of index from hash in the Subdomain object
-    int len = indexSize<T>(sdom_id);
-
-    // Construct a range covering that Index
-    return RAJA::RangeSegment(0, len);
-  }
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Input_Variables.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Input_Variables.cpp
deleted file mode 100644
index b1f13d4b0..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Input_Variables.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Input_Variables.h>
-
-#ifdef KRIPKE_US_MPI
-#include<mpi.h>
-#endif
-
-/**
-* Setup the default input choices
-*/
-Input_Variables::Input_Variables() : 
-  nx(16), ny(16), nz(16),
-  num_directions(96),
-  num_groups(32),
-  legendre_order(4),
-  quad_num_polar(0),
-  quad_num_azimuthal(0),
- 
-  nesting(NEST_DGZ),
- 
-  npx(1), npy(1), npz(1),
-  num_dirsets(8),
-  num_groupsets(2),
-  layout_pattern(0),
-  
-  niter(10),
-  parallel_method(PMETHOD_SWEEP),
-  run_name("kripke")
-{
-  num_zonesets_dim[0] = 1; 
-  num_zonesets_dim[1] = 1;
-  num_zonesets_dim[2] = 1;
-
-  sigt[0] = 0.1;  
-  sigt[1] = 0.0001;
-  sigt[2] = 0.1;
-  
-  sigs[0] = 0.05;  
-  sigs[1] = 0.00005;
-  sigs[2] = 0.05; 
-}
-
-/**
- *  Checks validity of inputs, returns 'true' on error.
- */
-bool Input_Variables::checkValues(void) const{
-  // make sure any output only goes to root
-#ifdef KRIPKE_USE_MPI
-  int rank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-#else
-  int rank = 0;
-#endif
-
-  if(num_zonesets_dim[0] <= 0 || num_zonesets_dim[1] <= 0 || num_zonesets_dim[2] <= 0){
-    if(!rank)
-      printf("Number of zone-sets in each dim need to be greater than or equal to 1\n");
-    return true;
-  }
-  
-  if(layout_pattern < 0 || layout_pattern > 1){
-    if(!rank)
-      printf("Layout(%d) must be either 0 or 1\n", layout_pattern);
-    return true;
-  }
-  
-  if(nesting < 0){
-    if(!rank)
-      printf("Invalid nesting selected\n");
-    return true;
-  }
-  
-  if(num_groups < 1){
-    if(!rank)
-      printf("Number of groups (%d) needs to be at least 1\n", num_groups);
-    return true;
-  }
-  
-  if(num_groups % num_groupsets){
-    if(!rank)
-      printf("Number of groups (%d) must be evenly divided by number of groupsets (%d)\n",
-        num_groups, num_groupsets);
-    return true;
-  }
-  
-  if(num_directions < 8){
-    if(!rank)
-      printf("Number of directions (%d) needs to be at least 8\n", num_directions);
-    return true;
-  }
-  
-  if(num_dirsets % 8 && num_dirsets < 8){
-    if(!rank)
-      printf("Number of direction sets (%d) must be a multiple of 8\n", num_dirsets);
-    return true;
-  }
-  
-  if(num_directions % num_dirsets){
-    if(!rank)
-      printf("Number of directions (%d) must be evenly divided by number of directionsets(%d)\n",
-        num_directions, num_dirsets);
-    return true;
-  }
-  
-  if(legendre_order < 0){
-    if(!rank)
-      printf("Legendre scattering order (%d) must be >= 0\n", legendre_order);
-    return true;
-  }
-  
-  if(niter < 1){
-    if(!rank)
-      printf("You must run at least one iteration (%d)\n", niter);
-    return true;
-  }
-  
-  return false;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Input_Variables.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Input_Variables.h
deleted file mode 100644
index 9d3f40573..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Input_Variables.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_INPUT_VARIABLES_H__
-#define KRIPKE_INPUT_VARIABLES_H__
-
-#include<Kripke.h>
-
-/**
- * This structure defines the input parameters to setup a problem.
- */
-
-struct Input_Variables {
-  Input_Variables();
-  
-  bool checkValues(void) const;
-  
-  // Problem Description
-  int nx, ny, nz;               // Number of spatial zones in x,y,z
-  int num_directions;           // Total number of directions
-  int num_groups;               // Total number of energy groups
-  int legendre_order;           // Scattering order (number Legendre coeff's - 1)
-  int quad_num_polar;           // Number of polar quadrature points (0 for dummy)
-  int quad_num_azimuthal;       // Number of azimuthal quadrature points (0 for dummy)
-
-  // On-Node Options
-  Nesting_Order nesting;        // Data layout and loop ordering (of Psi)
-  
-  // Parallel Decomp
-  int npx, npy, npz;            // The number of processors in x,y,z
-  int num_dirsets;              // Number of direction sets
-  int num_groupsets;            // Number of energy group sets
-  int num_zonesets_dim[3];      // Number of zoneset in x, y, z  
-  int layout_pattern;           // Which subdomain/task layout to use
-  
-  // Physics and Solver Options
-  int niter;                    // number of solver iterations to run
-  ParallelMethod parallel_method;
-  double sigt[3];               // total cross section for 3 materials
-  double sigs[3];               // total scattering cross section for 3 materials
-  
-  // Output Options
-  std::string run_name;         // Name to use when generating output files
-#ifdef KRIPKE_USE_SILO
-  std::string silo_basename;    // name prefix for silo output files
-#endif
-
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel.cpp
deleted file mode 100644
index 41914d77a..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel.cpp
+++ /dev/null
@@ -1,521 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel.h>
-#include<Kripke/Grid.h>
-#include<Kripke/SubTVec.h>
-
-#include<RAJA/RAJA.hxx>
-
-#include<Kripke/Kernel/Kernel_3d_GDZ.h>
-#include<Kripke/Kernel/Kernel_3d_DGZ.h>
-#include<Kripke/Kernel/Kernel_3d_ZDG.h>
-#include<Kripke/Kernel/Kernel_3d_DZG.h>
-#include<Kripke/Kernel/Kernel_3d_ZGD.h>
-#include<Kripke/Kernel/Kernel_3d_GZD.h>
-
-#include<Kripke/Kernel/DataPolicy.h>
-
-#include<Kripke/Kernel/LTimesPolicy.h>
-#include<Kripke/Kernel/LPlusTimesPolicy.h>
-#include<Kripke/Kernel/ScatteringPolicy.h>
-#include<Kripke/Kernel/SourcePolicy.h>
-#include<Kripke/Kernel/SweepPolicy.h>
-#include<Kripke/Kernel/ParticleEditPolicy.h>
-
-// For now, if CUDA is being used, then we are using functors for the kernels 
-// instead of lambdas.  CUDA8's __host__ __device__ lambdas might fix this
-// restriction
-#ifdef RAJA_ENABLE_CUDA
-#define KRIPKE_USE_FUNCTORS
-#else
-
-// Uncomment the next line to force the use of functors
-//#define KRIPKE_USE_FUNCTORS
-
-#endif
-
-
-#ifdef KRIPKE_USE_FUNCTORS
-#include<Kripke/KernelFunctors.h>
-#endif
-
-/*
- This function provides a mapping from the runtime Nesting_Order variable to a
- compile-time type (ie NEST_DZG_T, etc.), passing that type as the first 
- argument to the callable "kernel".
-*/
-template<typename KERNEL, typename ... ARGS>
-RAJA_INLINE
-void callKernelWithPolicy(Nesting_Order nesting_order, KERNEL kernel, ARGS & ... args){
-  switch(nesting_order){
-    case NEST_DGZ: kernel(NEST_DGZ_T(), args...); break;
-#ifndef RAJA_COMPILER_ICC
-    case NEST_DZG: kernel(NEST_DZG_T(), args...); break;
-    case NEST_GDZ: kernel(NEST_GDZ_T(), args...); break;
-    case NEST_GZD: kernel(NEST_GZD_T(), args...); break;
-    case NEST_ZDG: kernel(NEST_ZDG_T(), args...); break;
-    case NEST_ZGD: kernel(NEST_ZGD_T(), args...); break;
-#else
-    default: KripkeAbort("All nesting orders except DGZ are currently disabled with the Intel compilers\n");
-#endif
-  }
-}
-
-
-/**
- * Factory to create a kernel object for the specified nesting
- */
-Kernel *createKernel(Nesting_Order nest, int num_dims){
-  if(num_dims == 3){
-    switch(nest){
-    case NEST_GDZ:
-      return new Kernel_3d_GDZ();
-    case NEST_DGZ:
-      return new Kernel_3d_DGZ();
-    case NEST_ZDG:
-      return new Kernel_3d_ZDG();
-    case NEST_DZG:
-      return new Kernel_3d_DZG();
-    case NEST_ZGD:
-      return new Kernel_3d_ZGD();
-    case NEST_GZD:
-      return new Kernel_3d_GZD();
-    }
-  }
-
-  KripkeAbort("Unknown nesting order %d or number of dimensions %d\n", (int)nest, num_dims);
-  return NULL;
-}
-
-
-Kernel::Kernel(Nesting_Order nest) :
-  nesting_order(nest)
-{}
-
-Kernel::~Kernel(){
-}
-
-
-
-struct Kernel_LTimes{
-
-  template<typename nest_type>
-  RAJA_INLINE
-  void operator()(nest_type, Grid_Data &domain) const {
-
-    typedef DataPolicy<nest_type> POL;
-    
-    using PHI = typename POL::View_Phi;
-    using PSI = typename POL::View_Psi;
-    using ELL = typename POL::View_Ell; 
-    
-    // Zero Phi
-    FORALL_ZONESETS(seq_pol, domain, sdom_id, sdom)
-      sdom.phi->clear(0.0);
-    END_FORALL
-
-    // Loop over Subdomains
-    FORALL_SUBDOMAINS(seq_pol, domain, sdom_id, sdom)
-
-      // Get dimensioning
-      int group0 = sdom.group0;
-
-      // Get pointers
-      PSI psi(domain, sdom_id, sdom.psi->ptr());
-      PHI phi(domain, sdom_id, sdom.phi->ptr());
-      ELL ell(domain, sdom_id, sdom.ell->ptr());
-
-#ifdef KRIPKE_USE_FUNCTORS
-      dForallN<LTimesPolicy<nest_type>, IMoment, IDirection, IGroup, IZone>(
-        domain, sdom_id, 
-        LTimesFcn<PHI, ELL, PSI>(phi, ell, psi, group0)
-      );
-#else
-      dForallN<LTimesPolicy<nest_type>, IMoment, IDirection, IGroup, IZone>(
-        domain, sdom_id,
-        RAJA_LAMBDA (IMoment nm, IDirection d, IGroup g, IZone z){
-
-          IGlobalGroup g_global( (*g) + group0);
-
-          phi(nm, g_global, z) += ell(d, nm) * psi(d, g, z);
-        }
-      );
-#endif
-
-    END_FORALL
-  }
-};
-
-
-void Kernel::LTimes(Grid_Data *domain) {
-  callKernelWithPolicy(nesting_order, Kernel_LTimes(), *domain);
-}
-
-
-
-struct Kernel_LPlusTimes {
-  template<typename nest_type>
-  RAJA_INLINE
-  void operator()(nest_type, Grid_Data &domain) const {
-
-    typedef DataPolicy<nest_type> POL;
-
-    using PHI      = typename POL::View_Phi;
-    using PSI      = typename POL::View_Psi;
-    using ELL_PLUS = typename POL::View_EllPlus;
-
-    // Zero Phi
-    FORALL_SUBDOMAINS(seq_pol, domain, sdom_id, sdom)
-      sdom.rhs->clear(0.0);
-    END_FORALL
-
-    // Loop over Subdomains
-    FORALL_SUBDOMAINS(seq_pol, domain, sdom_id, sdom)
-
-      // Get dimensioning
-      int group0 = sdom.group0;
-
-      // Get pointers
-      PSI      rhs     (domain, sdom_id, sdom.rhs->ptr());
-      PHI      phi_out (domain, sdom_id, sdom.phi_out->ptr());
-      ELL_PLUS ell_plus(domain, sdom_id, sdom.ell_plus->ptr());
-
-#ifdef KRIPKE_USE_FUNCTORS
-      dForallN<LPlusTimesPolicy<nest_type>, IMoment, IDirection, IGroup, IZone>(
-        domain, sdom_id,
-        LPlusTimesFcn<PSI, ELL_PLUS, PHI>(rhs, ell_plus, phi_out, group0)
-      );
-#else
-      dForallN<LPlusTimesPolicy<nest_type>, IMoment, IDirection, IGroup, IZone>(
-        domain, sdom_id,
-        RAJA_LAMBDA (IMoment nm, IDirection d, IGroup g, IZone z){
-
-          IGlobalGroup g_global( (*g) + group0);
-
-          rhs(d, g, z) += ell_plus(d, nm) * phi_out(nm, g_global, z);
-        }
-      );
-#endif
-
-    END_FORALL
-  }
-};
-
-void Kernel::LPlusTimes(Grid_Data *domain) {
-  callKernelWithPolicy(nesting_order, Kernel_LPlusTimes(), *domain);
-}
-
-
-
-
-
-
-
-/**
-  Compute scattering source term phi_out from flux moments in phi.
-  phi_out(gp,z,nm) = sum_g { sigs(g, n, gp) * phi(g,z,nm) }
-*/
-struct Kernel_Scattering{
-  template<typename nest_type>
-  RAJA_INLINE
-  void operator()(nest_type, Grid_Data &domain) const {
-    
-    typedef DataPolicy<nest_type> POL;
-
-    // Zero out source terms
-    FORALL_ZONESETS(seq_pol, domain, sdom_id, sdom)
-      sdom.phi_out->clear(0.0);
-    END_FORALL
-
-    // Loop over zoneset subdomains
-    FORALL_ZONESETS(seq_pol, domain, sdom_id, sdom)
-
-      typename POL::View_Phi     phi    (domain, sdom_id, sdom.phi->ptr());
-      typename POL::View_Phi     phi_out(domain, sdom_id, sdom.phi_out->ptr());
-      typename POL::View_SigS    sigs   (domain, sdom_id, domain.sigs->ptr());
-
-      typename POL::View_MixedToZones    mixed_to_zones (domain, sdom_id, (IZone*)&sdom.mixed_to_zones[0]);
-      typename POL::View_MixedToMaterial mixed_material (domain, sdom_id, (IMaterial*)&sdom.mixed_material[0]);
-      typename POL::View_MixedToFraction mixed_fraction (domain, sdom_id, &sdom.mixed_fraction[0]);
-      typename POL::View_MomentToCoeff   moment_to_coeff(domain, sdom_id, (ILegendre*)&domain.moment_to_coeff[0]);
-
-#ifdef KRIPKE_USE_FUNCTORS
-      dForallN<ScatteringPolicy<nest_type>, IMoment, IGlobalGroup, IGlobalGroup, IMix>(
-        domain, sdom_id,
-        ScatteringFcn<typename POL::View_Phi,
-                      typename POL::View_SigS,
-                      typename POL::View_MixedToZones,
-                      typename POL::View_MixedToMaterial,
-                      typename POL::View_MixedToFraction,
-                      typename POL::View_MomentToCoeff>
-                    (phi, phi_out, sigs, mixed_to_zones, mixed_material, mixed_fraction, moment_to_coeff)
-      );
-#else
-      dForallN<ScatteringPolicy<nest_type>, IMoment, IGlobalGroup, IGlobalGroup, IMix>(
-        domain, sdom_id,
-        RAJA_LAMBDA (IMoment nm, IGlobalGroup g, IGlobalGroup gp, IMix mix){
-        
-          ILegendre n = moment_to_coeff(nm);
-          IZone zone = mixed_to_zones(mix);
-          IMaterial material = mixed_material(mix);
-          double fraction = mixed_fraction(mix);
-
-          phi_out(nm, gp, zone) +=
-            sigs(n, g, gp, material) * phi(nm, g, zone) * fraction;
-
-        });  
-#endif
-    END_FORALL // zonesets
-  }
-
-};
-
-void Kernel::scattering(Grid_Data *domain) {
-  callKernelWithPolicy(nesting_order, Kernel_Scattering(), *domain);
-}
-
-
-
-  
-/**
- * Add an isotropic source, with flux of 1, to every zone with Region 1
- * (or material 0).
- *
- * Since it's isotropic, we're just adding this to nm=0.
- */
-
-struct Kernel_Source {
-  template<typename nest_type>
-  RAJA_INLINE
-  void operator()(nest_type, Grid_Data &domain) const {
-    typedef DataPolicy<nest_type> POL;
-
-    // Loop over zoneset subdomains
-    FORALL_ZONESETS(seq_pol, domain, sdom_id, sdom)
-      typename POL::View_Phi             phi_out       (domain, sdom_id, sdom.phi_out->ptr());
-      typename POL::View_MixedToZones    mixed_to_zones(domain, sdom_id, (IZone*)&sdom.mixed_to_zones[0]);
-      typename POL::View_MixedToMaterial mixed_material(domain, sdom_id, (IMaterial*)&sdom.mixed_material[0]);
-      typename POL::View_MixedToFraction mixed_fraction(domain, sdom_id, &sdom.mixed_fraction[0]);
-
-#ifdef KRIPKE_USE_FUNCTORS
-      dForallN<SourcePolicy<nest_type>, IGlobalGroup, IMix>(
-        domain, sdom_id,
-        SourceFcn<typename POL::View_Phi,
-                  typename POL::View_MixedToZones,
-                  typename POL::View_MixedToMaterial,
-                  typename POL::View_MixedToFraction>
-                (phi_out, mixed_to_zones, mixed_material, mixed_fraction)
-      );
-      
-
-#else
-      dForallN<SourcePolicy<nest_type>, IGlobalGroup, IMix>(
-        domain, sdom_id,
-        RAJA_LAMBDA (IGlobalGroup g, IMix mix){
-          IZone zone = mixed_to_zones(mix);
-          IMaterial material = mixed_material(mix);
-          double fraction = mixed_fraction(mix);
-
-          if(*material == 0){
-            phi_out(IMoment(0), g, zone) += 1.0 * fraction;
-          }
-      }); 
-#endif
-    END_FORALL
-  }
-};
-
-void Kernel::source(Grid_Data *domain) {
-  callKernelWithPolicy(nesting_order, Kernel_Source(), *domain);
-}
-
-
-
-
-struct Kernel_Sweep{
-
-  template<typename nest_type>
-  RAJA_INLINE
-  void operator()(nest_type, Grid_Data &domain, int sdom_id) const {
-    
-    typedef DataPolicy<nest_type> POL;
-
-    Subdomain *sdom = &domain.subdomains[sdom_id];
-
-    typename POL::View_Directions direction(domain, sdom_id, sdom->directions);
-
-    typename POL::View_Psi     rhs (domain, sdom_id, sdom->rhs->ptr());
-    typename POL::View_Psi     psi (domain, sdom_id, sdom->psi->ptr());
-    typename POL::View_SigT    sigt(domain, sdom_id, sdom->sigt->ptr());
-
-    typename POL::View_dx      dx(domain, sdom_id, &sdom->deltas[0][0]);
-    typename POL::View_dy      dy(domain, sdom_id, &sdom->deltas[1][0]);
-    typename POL::View_dz      dz(domain, sdom_id, &sdom->deltas[2][0]);
-
-    typename POL::TLayout_Zone zone_layout(domain, sdom_id);
-
-    typename POL::View_FaceI face_lf(domain, sdom_id, sdom->plane_data[0]->ptr());
-    typename POL::View_FaceJ face_fr(domain, sdom_id, sdom->plane_data[1]->ptr());
-    typename POL::View_FaceK face_bo(domain, sdom_id, sdom->plane_data[2]->ptr());
-
-    // All directions have same id,jd,kd, since these are all one Direction Set
-    // So pull that information out now
-    Grid_Sweep_Block const &extent = sdom->sweep_block;
-    typename POL::View_IdxToI  idx_to_i(domain, sdom_id, (IZoneI*)&extent.idx_to_i[0]);
-    typename POL::View_IdxToJ  idx_to_j(domain, sdom_id, (IZoneJ*)&extent.idx_to_j[0]);
-    typename POL::View_IdxToK  idx_to_k(domain, sdom_id, (IZoneK*)&extent.idx_to_k[0]);
-
-#ifdef KRIPKE_USE_FUNCTORS
-    RAJA::forallN<SweepPolicy<nest_type>, IDirection, IGroup, IZoneIdx>( 
-      domain.indexRange<IDirection>(sdom_id),
-      domain.indexRange<IGroup>(sdom_id),
-      extent.indexset_sweep,
-      SweepFcn<typename POL::View_Directions,
-               typename POL::View_Psi,
-               typename POL::View_SigT,
-               typename POL::View_dx,
-               typename POL::View_dy,
-               typename POL::View_dz,
-               typename POL::TLayout_Zone,
-               typename POL::View_FaceI,
-               typename POL::View_FaceJ,
-               typename POL::View_FaceK,
-               typename POL::View_IdxToI,
-               typename POL::View_IdxToJ,
-               typename POL::View_IdxToK>
-               (direction, rhs, psi, sigt, dx, dy, dz, zone_layout, 
-                face_lf, face_fr, face_bo, idx_to_i, idx_to_j, idx_to_k)
-    );
-#else
-
-    RAJA::forallN<SweepPolicy<nest_type>, IDirection, IGroup, IZoneIdx>( 
-      domain.indexRange<IDirection>(sdom_id),
-      domain.indexRange<IGroup>(sdom_id),
-      extent.indexset_sweep,
-      RAJA_LAMBDA (IDirection d, IGroup g, IZoneIdx zone_idx){
-
-        IZoneI i = idx_to_i(zone_idx);
-        IZoneJ j = idx_to_j(zone_idx);
-        IZoneK k = idx_to_k(zone_idx);
-
-        double const xcos_dxi = 2.0 * direction(d).xcos / dx(i+1);
-        double const ycos_dyj = 2.0 * direction(d).ycos / dy(j+1);
-        double const zcos_dzk = 2.0 * direction(d).zcos / dz(k+1);
-
-        IZone z = zone_layout(i,j,k);
-
-        // Calculate new zonal flux
-        double const psi_d_g_z = (
-              rhs(d,g,z)
-            + face_lf(d,g,j,k) * xcos_dxi
-            + face_fr(d,g,i,k) * ycos_dyj
-            + face_bo(d,g,i,j) * zcos_dzk)
-            / (xcos_dxi + ycos_dyj + zcos_dzk + sigt(g,z) );
-
-        psi(d,g,z) = psi_d_g_z;
-
-        // Apply diamond-difference relationships
-        face_lf(d,g,j,k) = 2.0 * psi_d_g_z - face_lf(d,g,j,k);
-        face_fr(d,g,i,k) = 2.0 * psi_d_g_z - face_fr(d,g,i,k);
-        face_bo(d,g,i,j) = 2.0 * psi_d_g_z - face_bo(d,g,i,j);
-      }); 
-#endif
-  }
-};
-
-
-void Kernel::sweep(Grid_Data *domain, int sdom_id) {
-  callKernelWithPolicy(nesting_order, Kernel_Sweep(), *domain, sdom_id);
-}
-
-/**
- *  Edit that sums up number of particles on mesh.
- */
-
-struct Kernel_ParticleEdit {
-
-  double &part;
-
-  Kernel_ParticleEdit(double &t) : part(t) {}
-
-  template<typename nest_type>
-  RAJA_INLINE
-  void operator()(nest_type, Grid_Data &domain) const {
-    typedef DataPolicy<nest_type> POL;
- 
-    RAJA::ReduceSum<typename POL::reduce_policy, double> part_reduce(0.0);
-       
-    // Loop over zoneset subdomains
-    FORALL_SUBDOMAINS(seq_pol, domain, sdom_id, sdom)
-      typename POL::View_Psi         psi      (domain, sdom_id, sdom.psi->ptr());
-      typename POL::View_Directions  direction(domain, sdom_id, sdom.directions);
-      typename POL::View_Volume      volume   (domain, sdom_id, &sdom.volume[0]);
-
-      
-#ifdef KRIPKE_USE_FUNCTORS
-      dForallN<ParticleEditPolicy<nest_type>, IDirection, IGroup, IZone>( 
-        domain, sdom_id,
-        ParticleEditFcn<decltype(part_reduce),
-                  typename POL::View_Directions,
-                  typename POL::View_Psi,
-                  typename POL::View_Volume>
-                (part_reduce, direction, psi, volume)
-      );
-#else
-      dForallN<ParticleEditPolicy<nest_type>, IDirection, IGroup, IZone>( 
-        domain, sdom_id,
-        RAJA_LAMBDA (IDirection d, IGroup g, IZone z){
-          part_reduce += direction(d).w * psi(d,g,z) * volume(z);              
-        }
-      ); 
-#endif
-    END_FORALL
-    
-    part = part_reduce;
-    
-    // reduce across MPI
-#ifdef KRIPKE_USE_MPI
-    double part_global;
-
-    MPI_Reduce(&part, &part_global, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-
-    part = part_global;
-#endif
-  }
-};
-
-double Kernel::particleEdit(Grid_Data *domain) {
-  double total = 0.0;
-  callKernelWithPolicy(nesting_order, Kernel_ParticleEdit(total), *domain);
-  return total;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel.h
deleted file mode 100644
index e6ed33ef6..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_H__
-#define KRIPKE_KERNEL_H__
-
-#include <Kripke.h>
-
-struct Grid_Data;
-struct SubTVec;
-struct Subdomain;
-
-/**
- * This is the Kernel base-class and interface definition.
- * This abstracts the storage of Psi, Phi, L, L+ from the rest of the code,
- * providing data-layout specific routines.
- */
-class Kernel {
-  public:
-    explicit Kernel(Nesting_Order nest);
-    virtual ~Kernel();
-    virtual Nesting_Order nestingPsi(void) const = 0;
-    virtual Nesting_Order nestingPhi(void) const = 0;
-    virtual Nesting_Order nestingSigt(void) const = 0;
-    virtual Nesting_Order nestingEll(void) const = 0;
-    virtual Nesting_Order nestingEllPlus(void) const = 0;
-    virtual Nesting_Order nestingSigs(void) const = 0;
-
-    // Computational Kernels
-    void LTimes(Grid_Data *grid_data);
-    void LPlusTimes(Grid_Data *grid_data);
-    void scattering(Grid_Data *grid_data);
-    void source(Grid_Data *grid_data);
-    void sweep(Grid_Data *domain, int sdom_id);
-    double particleEdit(Grid_Data *domain);
-  private:
-    Nesting_Order nesting_order;
-};
-
-
-// Factory to create correct kernel object
-Kernel *createKernel(Nesting_Order, int num_dims);
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/DataPolicy.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/DataPolicy.h
deleted file mode 100644
index c0d6b540d..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/DataPolicy.h
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KERNEL_VARIABLE_POLICY_H__
-#define KERNEL_VARIABLE_POLICY_H__
-
-#include<Kripke.h>
-#include<Kripke/Directions.h>
-#include<Kripke/DView.h>
-
-
-/*
- * Define strongly-typed indices used in Kripke
- */
-RAJA_INDEX_VALUE(IMaterial,    "IMaterial");     // Material ID
-RAJA_INDEX_VALUE(ILegendre,    "ILegendre");     // Legendre expansion coefficient
-RAJA_INDEX_VALUE(IMoment,      "IMoment");       // Spherical harmonic moment
-RAJA_INDEX_VALUE(IDirection,   "IDirection");    // Local direction
-RAJA_INDEX_VALUE(IGlobalGroup, "IGlobalGroup");  // Global energy group
-RAJA_INDEX_VALUE(IGroup,       "IGroup");        // Local energy group
-RAJA_INDEX_VALUE(IZone,        "IZone");         // Cannonical zone number
-RAJA_INDEX_VALUE(IZoneIdx,     "IZoneIdx");      // Mapped zone index (sequential in hyperplane)
-RAJA_INDEX_VALUE(IMix,         "IMix");          // Mixed element slot
-RAJA_INDEX_VALUE(IZoneI,       "IZoneI");        // zone on the I boundary face
-RAJA_INDEX_VALUE(IZoneJ,       "IZoneJ");        // zone on the K boundary face
-RAJA_INDEX_VALUE(IZoneK,       "IZoneK");        // zone on the K boundary face
-
-
-
-/**
- * Layout policies that don't change with nesting.
- */
-struct FixedLayoutPolicy {
-  typedef DLayout<int, RAJA::PERM_JI, IDirection, IMoment> Layout_Ell;
-  typedef DLayout<int, RAJA::PERM_IJ, IDirection, IMoment> Layout_EllPlus;
-
-  typedef DLayout<IZone, RAJA::PERM_KJI, IZoneI, IZoneJ, IZoneK> TLayout_Zone;
-};
-
-
-/**
- * Layout policies tied directly to nesting.
- */
-template<typename T>
-struct NestingPolicy{};
-
-template<>
-struct NestingPolicy<NEST_DGZ_T> : public FixedLayoutPolicy {
-  typedef DLayout<int, RAJA::PERM_IJK, IDirection, IGroup, IZone>    Layout_Psi;
-  typedef DLayout<int, RAJA::PERM_IJK, IMoment, IGlobalGroup, IZone> Layout_Phi;
-  typedef DLayout<int, RAJA::PERM_IJKL, ILegendre, IGlobalGroup, IGlobalGroup, IMaterial> Layout_SigS;
-  typedef DLayout<int, RAJA::PERM_IJ, IGroup, IZone> Layout_SigT;
-  
-  typedef DLayout<int, RAJA::PERM_IJLK, IDirection, IGroup, IZoneJ, IZoneK> Layout_FaceI;
-  typedef DLayout<int, RAJA::PERM_IJLK, IDirection, IGroup, IZoneI, IZoneK> Layout_FaceJ;
-  typedef DLayout<int, RAJA::PERM_IJLK, IDirection, IGroup, IZoneI, IZoneJ> Layout_FaceK;
-};
-
-template<>
-struct NestingPolicy<NEST_DZG_T> : public FixedLayoutPolicy {
-  typedef DLayout<int, RAJA::PERM_IKJ, IDirection, IGroup, IZone>    Layout_Psi;
-  typedef DLayout<int, RAJA::PERM_IKJ, IMoment, IGlobalGroup, IZone> Layout_Phi;
-  typedef DLayout<int, RAJA::PERM_ILJK, ILegendre, IGlobalGroup, IGlobalGroup, IMaterial> Layout_SigS;
-  typedef DLayout<int, RAJA::PERM_JI, IGroup, IZone> Layout_SigT;
-
-  typedef DLayout<int, RAJA::PERM_ILKJ, IDirection, IGroup, IZoneJ, IZoneK> Layout_FaceI;
-  typedef DLayout<int, RAJA::PERM_ILKJ, IDirection, IGroup, IZoneI, IZoneK> Layout_FaceJ;
-  typedef DLayout<int, RAJA::PERM_ILKJ, IDirection, IGroup, IZoneI, IZoneJ> Layout_FaceK;
-};
-
-template<>
-struct NestingPolicy<NEST_GDZ_T> : public FixedLayoutPolicy {
-  typedef DLayout<int, RAJA::PERM_JIK, IDirection, IGroup, IZone>    Layout_Psi;
-  typedef DLayout<int, RAJA::PERM_JIK, IMoment, IGlobalGroup, IZone> Layout_Phi;
-  typedef DLayout<int, RAJA::PERM_JKIL, ILegendre, IGlobalGroup, IGlobalGroup, IMaterial> Layout_SigS;
-  typedef DLayout<int, RAJA::PERM_IJ, IGroup, IZone> Layout_SigT;
-
-  typedef DLayout<int, RAJA::PERM_JILK, IDirection, IGroup, IZoneJ, IZoneK> Layout_FaceI;
-  typedef DLayout<int, RAJA::PERM_JILK, IDirection, IGroup, IZoneI, IZoneK> Layout_FaceJ;
-  typedef DLayout<int, RAJA::PERM_JILK, IDirection, IGroup, IZoneI, IZoneJ> Layout_FaceK;
-};
-
-template<>
-struct NestingPolicy<NEST_GZD_T> : public FixedLayoutPolicy {
-  typedef DLayout<int, RAJA::PERM_JKI, IDirection, IGroup, IZone>    Layout_Psi;
-  typedef DLayout<int, RAJA::PERM_JKI, IMoment, IGlobalGroup, IZone> Layout_Phi;
-  typedef DLayout<int, RAJA::PERM_JKLI, ILegendre, IGlobalGroup, IGlobalGroup, IMaterial> Layout_SigS;
-  typedef DLayout<int, RAJA::PERM_IJ, IGroup, IZone> Layout_SigT;
-
-  typedef DLayout<int, RAJA::PERM_JLKI, IDirection, IGroup, IZoneJ, IZoneK> Layout_FaceI;
-  typedef DLayout<int, RAJA::PERM_JLKI, IDirection, IGroup, IZoneI, IZoneK> Layout_FaceJ;
-  typedef DLayout<int, RAJA::PERM_JLKI, IDirection, IGroup, IZoneI, IZoneJ> Layout_FaceK;
-};
-
-template<>
-struct NestingPolicy<NEST_ZDG_T> : public FixedLayoutPolicy {
-  typedef DLayout<int, RAJA::PERM_KIJ, IDirection, IGroup, IZone>    Layout_Psi;
-  typedef DLayout<int, RAJA::PERM_KIJ, IMoment, IGlobalGroup, IZone> Layout_Phi;
-  typedef DLayout<int, RAJA::PERM_LIJK, ILegendre, IGlobalGroup, IGlobalGroup, IMaterial> Layout_SigS;
-  typedef DLayout<int, RAJA::PERM_JI, IGroup, IZone> Layout_SigT;
-
-  typedef DLayout<int, RAJA::PERM_LKIJ, IDirection, IGroup, IZoneJ, IZoneK> Layout_FaceI;
-  typedef DLayout<int, RAJA::PERM_LKIJ, IDirection, IGroup, IZoneI, IZoneK> Layout_FaceJ;
-  typedef DLayout<int, RAJA::PERM_LKIJ, IDirection, IGroup, IZoneI, IZoneJ> Layout_FaceK;
-};
-
-template<>
-struct NestingPolicy<NEST_ZGD_T> : public FixedLayoutPolicy {
-  typedef DLayout<int, RAJA::PERM_KJI, IDirection, IGroup, IZone>    Layout_Psi;
-  typedef DLayout<int, RAJA::PERM_KJI, IMoment, IGlobalGroup, IZone> Layout_Phi;
-  typedef DLayout<int, RAJA::PERM_LJKI, ILegendre, IGlobalGroup, IGlobalGroup, IMaterial> Layout_SigS;
-  typedef DLayout<int, RAJA::PERM_JI, IGroup, IZone> Layout_SigT;
-
-  typedef DLayout<int, RAJA::PERM_LKJI, IDirection, IGroup, IZoneJ, IZoneK> Layout_FaceI;
-  typedef DLayout<int, RAJA::PERM_LKJI, IDirection, IGroup, IZoneI, IZoneK> Layout_FaceJ;
-  typedef DLayout<int, RAJA::PERM_LKJI, IDirection, IGroup, IZoneI, IZoneJ> Layout_FaceK;
-};
-
-
-/**
- * Views that have fixed policies
- */
-struct FixedViewPolicy {
-  typedef DView<double, DLayout<int, RAJA::PERM_I, IZoneI> > View_dx;
-  typedef DView<double, DLayout<int, RAJA::PERM_I, IZoneJ> > View_dy;
-  typedef DView<double, DLayout<int, RAJA::PERM_I, IZoneK> > View_dz;
-  typedef DView<Directions, DLayout<int, RAJA::PERM_I, IDirection> > View_Directions;
-  typedef DView<double, DLayout<int, RAJA::PERM_I, IZone> > View_Volume;
-  
-  typedef DView<IZoneI, DLayout<int, RAJA::PERM_I, IZoneIdx> > View_IdxToI;
-  typedef DView<IZoneJ, DLayout<int, RAJA::PERM_I, IZoneIdx> > View_IdxToJ;
-  typedef DView<IZoneK, DLayout<int, RAJA::PERM_I, IZoneIdx> > View_IdxToK;
-
-  typedef DView<IZone, DLayout<int, RAJA::PERM_I, IMix> > View_MixedToZones;
-  typedef DView<IMaterial, DLayout<int, RAJA::PERM_I, IMix> > View_MixedToMaterial;
-  typedef DView<double, DLayout<int, RAJA::PERM_I, IMix> > View_MixedToFraction;
-  typedef DView<ILegendre, DLayout<int, RAJA::PERM_I, IMoment> > View_MomentToCoeff;
-};
-
-/**
- * Views with policies that vary between nestings.
- */
-template<typename T>
-struct ViewPolicy : public FixedViewPolicy {
-  // Discrete and Moment Unknowns
-  typedef DView<double, typename T::Layout_Psi> View_Psi;
-  typedef DView<double, typename T::Layout_Phi> View_Phi;
-
-  // Spatial domain face indices
-  typedef DView<double, typename T::Layout_FaceI> View_FaceI;
-  typedef DView<double, typename T::Layout_FaceJ> View_FaceJ;
-  typedef DView<double, typename T::Layout_FaceK> View_FaceK;
-
-  // L and L+ matrices
-  typedef DView<double, typename T::Layout_Ell> View_Ell;
-  typedef DView<double, typename T::Layout_EllPlus> View_EllPlus;
-
-  // Data tables
-  typedef DView<double, typename T::Layout_SigS> View_SigS;
-  typedef DView<double, typename T::Layout_SigT> View_SigT;
-  
-#ifdef RAJA_ENABLE_OPENMP
-  typedef RAJA::omp_reduce reduce_policy;
-#else
-  typedef RAJA::seq_reduce reduce_policy;
-#endif
-};
-
-
-/**
- * Combined Policies for Layouts, Views.
- *
- * A convenience class: makes it easier to include in application.
- */
-struct FixedDataPolicy {
-  static const int memory_alignment = 64;
-};
-
-template<typename T>
-struct DataPolicy : public FixedDataPolicy, public NestingPolicy<T>, public ViewPolicy<NestingPolicy<T> >
-{
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DGZ.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DGZ.cpp
deleted file mode 100644
index 8b498bf7f..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DGZ.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_DGZ.h>
-
-Kernel_3d_DGZ::Kernel_3d_DGZ() :
-  Kernel(NEST_DGZ)
-{}
-
-Kernel_3d_DGZ::~Kernel_3d_DGZ()
-{}
-
-Nesting_Order Kernel_3d_DGZ::nestingPsi(void) const {
-  return NEST_DGZ;
-}
-
-Nesting_Order Kernel_3d_DGZ::nestingPhi(void) const {
-  return NEST_DGZ;
-}
-
-Nesting_Order Kernel_3d_DGZ::nestingSigt(void) const {
-  return NEST_DGZ;
-}
-
-Nesting_Order Kernel_3d_DGZ::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_DGZ::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_DGZ::nestingSigs(void) const {
-  return NEST_DGZ;
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DGZ.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DGZ.h
deleted file mode 100644
index d23dfc031..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DGZ.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_DGZ_H__
-#define KRIPKE_KERNEL_3D_DGZ_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_DGZ : public Kernel {
-  public:
-    Kernel_3d_DGZ();
-    virtual ~Kernel_3d_DGZ();
-    
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DZG.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DZG.cpp
deleted file mode 100644
index e825dfa3f..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DZG.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_DZG.h>
-
-Kernel_3d_DZG::Kernel_3d_DZG() :
-  Kernel(NEST_DZG)
-{}
-
-Kernel_3d_DZG::~Kernel_3d_DZG()
-{}
-
-Nesting_Order Kernel_3d_DZG::nestingPsi(void) const {
-  return NEST_DZG;
-}
-
-Nesting_Order Kernel_3d_DZG::nestingPhi(void) const {
-  return NEST_DZG;
-}
-
-Nesting_Order Kernel_3d_DZG::nestingSigt(void) const {
-  return NEST_DZG;
-}
-
-Nesting_Order Kernel_3d_DZG::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_DZG::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_DZG::nestingSigs(void) const {
-  return NEST_DZG;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DZG.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DZG.h
deleted file mode 100644
index 7831f1dae..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_DZG.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_DZG_H__
-#define KRIPKE_KERNEL_3D_DZG_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_DZG : public Kernel {
-  public:
-    Kernel_3d_DZG();
-    virtual ~Kernel_3d_DZG();
-    
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GDZ.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GDZ.cpp
deleted file mode 100644
index 3fa67c34e..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GDZ.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_GDZ.h>
-
-Kernel_3d_GDZ::Kernel_3d_GDZ() :
-  Kernel(NEST_GDZ)
-{}
-
-Kernel_3d_GDZ::~Kernel_3d_GDZ()
-{}
-
-Nesting_Order Kernel_3d_GDZ::nestingPsi(void) const {
-  return NEST_GDZ;
-}
-
-Nesting_Order Kernel_3d_GDZ::nestingPhi(void) const {
-  return NEST_GDZ;
-}
-
-Nesting_Order Kernel_3d_GDZ::nestingSigt(void) const {
-  return NEST_DGZ;
-}
-
-Nesting_Order Kernel_3d_GDZ::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_GDZ::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_GDZ::nestingSigs(void) const {
-  return NEST_GDZ;
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GDZ.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GDZ.h
deleted file mode 100644
index 200b0f05b..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GDZ.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_GDZ_H__
-#define KRIPKE_KERNEL_3D_GDZ_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_GDZ : public Kernel {
-  public:
-    Kernel_3d_GDZ();
-    virtual ~Kernel_3d_GDZ();
-  
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GZD.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GZD.cpp
deleted file mode 100644
index 19d1426eb..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GZD.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_GZD.h>
-
-Kernel_3d_GZD::Kernel_3d_GZD() :
-  Kernel(NEST_GZD)
-{}
-
-Kernel_3d_GZD::~Kernel_3d_GZD()
-{}
-
-Nesting_Order Kernel_3d_GZD::nestingPsi(void) const {
-  return NEST_GZD;
-}
-
-Nesting_Order Kernel_3d_GZD::nestingPhi(void) const {
-  return NEST_GZD;
-}
-
-Nesting_Order Kernel_3d_GZD::nestingSigt(void) const {
-  return NEST_DGZ;
-}
-
-Nesting_Order Kernel_3d_GZD::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_GZD::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_GZD::nestingSigs(void) const {
-  return NEST_GZD;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GZD.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GZD.h
deleted file mode 100644
index 590b635c6..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_GZD.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_GZD_H__
-#define KRIPKE_KERNEL_3D_GZD_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_GZD : public Kernel {
-  public:
-    Kernel_3d_GZD();
-    virtual ~Kernel_3d_GZD();
-    
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZDG.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZDG.cpp
deleted file mode 100644
index dbc8ab28e..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZDG.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_ZDG.h>
-
-Kernel_3d_ZDG::Kernel_3d_ZDG() :
-  Kernel(NEST_ZDG)
-{}
-
-Kernel_3d_ZDG::~Kernel_3d_ZDG()
-{}
-
-Nesting_Order Kernel_3d_ZDG::nestingPsi(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_ZDG::nestingPhi(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_ZDG::nestingSigt(void) const {
-  return NEST_DZG;
-}
-
-Nesting_Order Kernel_3d_ZDG::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_ZDG::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_ZDG::nestingSigs(void) const {
-  return NEST_ZDG;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZDG.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZDG.h
deleted file mode 100644
index d67059883..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZDG.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_ZDG_H__
-#define KRIPKE_KERNEL_3D_ZDG_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_ZDG : public Kernel {
-  public:
-    Kernel_3d_ZDG();
-    virtual ~Kernel_3d_ZDG();
-    
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZGD.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZGD.cpp
deleted file mode 100644
index 11b9b57a9..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZGD.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_ZGD.h>
-
-Kernel_3d_ZGD::Kernel_3d_ZGD() :
-  Kernel(NEST_ZGD)
-{}
-
-Kernel_3d_ZGD::~Kernel_3d_ZGD()
-{}
-
-Nesting_Order Kernel_3d_ZGD::nestingPsi(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_ZGD::nestingPhi(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_ZGD::nestingSigt(void) const {
-  return NEST_DZG;
-}
-
-Nesting_Order Kernel_3d_ZGD::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_ZGD::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_ZGD::nestingSigs(void) const {
-  return NEST_ZGD;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZGD.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZGD.h
deleted file mode 100644
index 93bf8d5c1..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/Kernel_3d_ZGD.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_ZGD_H__
-#define KRIPKE_KERNEL_3D_ZGD_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_ZGD : public Kernel {
-  public:
-    Kernel_3d_ZGD();
-    virtual ~Kernel_3d_ZGD();
-
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/LPlusTimesPolicy.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/LPlusTimesPolicy.h
deleted file mode 100644
index 53ae3f767..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/LPlusTimesPolicy.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KERNEL_LPLUSTIMES_POLICY_H__
-#define KERNEL_LPLUSTIMES_POLICY_H__
-
-#include<Kripke.h>
-
-
-template<typename T>
-struct LPlusTimesPolicy{}; // nm, d, g, z
-
-#ifdef RAJA_COMPILER_ICC
-template<>
-struct LPlusTimesPolicy<NEST_DGZ_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   RAJA::simd_exec>
-                                  >
-{};
-
-#else
-template<>
-struct LPlusTimesPolicy<NEST_DGZ_T> : RAJA::NestedPolicy< 
-                                        RAJA::ExecList<RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec, 
-                                                       RAJA::simd_exec>,
-                                        kripke_OMP_Parallel<
-                                          RAJA::Tile<
-                                            RAJA::TileList<RAJA::tile_none, 
-                                                           RAJA::tile_none, 
-                                                           RAJA::tile_none, 
-                                                           RAJA::tile_fixed<512>>,
-                                            RAJA::Permute<RAJA::PERM_JIKL>
-                                          >
-                                        >
-                                      >
-{};
-#endif
-
-
-template<>
-struct LPlusTimesPolicy<NEST_DZG_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec>, 
-                                        RAJA::Permute<RAJA::PERM_LIJK>
-                                      >
-{};
-
-template<>
-struct LPlusTimesPolicy<NEST_GDZ_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec, 
-                                                       RAJA::seq_exec>, 
-                                        RAJA::Permute<RAJA::PERM_KIJL>
-                                      >
-{};
-
-template<>
-struct LPlusTimesPolicy<NEST_GZD_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec, 
-                                                       kripke_omp_for_nowait_exec>, 
-                                        RAJA::Permute<RAJA::PERM_KLIJ>
-                                      >
-{};
-
-template<>
-struct LPlusTimesPolicy<NEST_ZDG_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<kripke_omp_for_nowait_exec, 
-                                                       RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec>, 
-                                        RAJA::Permute<RAJA::PERM_LIJK>
-                                      >
-{};
-
-template<>
-struct LPlusTimesPolicy<NEST_ZGD_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec, 
-                                                       kripke_omp_for_nowait_exec>, 
-                                        RAJA::Permute<RAJA::PERM_LKIJ>
-                                      >
-{};
-
-
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/LTimesPolicy.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/LTimesPolicy.h
deleted file mode 100644
index b282e90bd..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/LTimesPolicy.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KERNEL_LTIMES_POLICY_H__
-#define KERNEL_LTIMES_POLICY_H__
-
-#include<Kripke.h>
-
-
-template<typename T>
-struct LTimesPolicy{}; // nm, d, g, z
-
-#ifdef RAJA_COMPILER_ICC
-template<>
-struct LTimesPolicy<NEST_DGZ_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   RAJA::simd_exec>
-                                  >
-{};
-
-#else
-template<>
-struct LTimesPolicy<NEST_DGZ_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   kripke_omp_for_nowait_exec, 
-                                                   RAJA::simd_exec>,
-                                    kripke_OMP_Parallel<
-                                      RAJA::Tile<
-                                        RAJA::TileList<RAJA::tile_none, 
-                                                       RAJA::tile_none, 
-                                                       RAJA::tile_none, 
-                                                       RAJA::tile_fixed<512>>,
-                                        RAJA::Permute<RAJA::PERM_IJKL>
-                                      >
-                                    >
-                                  >
-{};
-#endif
-template<>
-struct LTimesPolicy<NEST_DZG_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   kripke_omp_for_nowait_exec>, 
-                                    RAJA::Permute<RAJA::PERM_LIJK>
-                                  >
-{};
-
-template<>
-struct LTimesPolicy<NEST_GDZ_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   kripke_omp_for_nowait_exec, 
-                                                   RAJA::seq_exec>, 
-                                    RAJA::Permute<RAJA::PERM_KIJL>
-                                  >
-{};
-
-template<>
-struct LTimesPolicy<NEST_GZD_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   kripke_omp_for_nowait_exec, 
-                                                   kripke_omp_for_nowait_exec>, 
-                                    RAJA::Permute<RAJA::PERM_KLIJ>
-                                  >
-{};
-
-template<>
-struct LTimesPolicy<NEST_ZDG_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<kripke_omp_for_nowait_exec, 
-                                                   RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   kripke_omp_for_nowait_exec>, 
-                                    RAJA::Permute<RAJA::PERM_LIJK>
-                                  >
-{}; 
-
-template<>
-struct LTimesPolicy<NEST_ZGD_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   kripke_omp_for_nowait_exec, 
-                                                   kripke_omp_for_nowait_exec>, 
-                                    RAJA::Permute<RAJA::PERM_LKIJ>
-                                  >
-{};
-
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/ParticleEditPolicy.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/ParticleEditPolicy.h
deleted file mode 100644
index c54208568..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/ParticleEditPolicy.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KERNEL_PARTICLEDIT_POLICY_H__
-#define KERNEL_PARTICLEDIT_POLICY_H__
-
-#include<Kripke.h>
-
-// There are really only 2 policies, based on group and zone ordering
-// So we define those here, and assign them to each nesting order
-
-using ParticleEditPolicy_CPU = RAJA::NestedPolicy<
-                                 RAJA::ExecList<kripke_omp_for_nowait_exec, 
-                                                RAJA::simd_exec,
-                                                RAJA::simd_exec>,
-                                 kripke_OMP_Parallel<RAJA::Execute>
-			                         >;
-
-
-template<typename T>
-struct ParticleEditPolicy {}; // g,mix
-
-template<>
-struct ParticleEditPolicy<NEST_DGZ_T> : ParticleEditPolicy_CPU {};
-
-template<>
-struct ParticleEditPolicy<NEST_DZG_T> : ParticleEditPolicy_CPU {};
-
-template<>
-struct ParticleEditPolicy<NEST_GDZ_T> : ParticleEditPolicy_CPU {};
-
-template<>
-struct ParticleEditPolicy<NEST_GZD_T> : ParticleEditPolicy_CPU {};
-
-template<>
-struct ParticleEditPolicy<NEST_ZDG_T> : ParticleEditPolicy_CPU {};
-
-template<>
-struct ParticleEditPolicy<NEST_ZGD_T> : ParticleEditPolicy_CPU {};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/ScatteringPolicy.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/ScatteringPolicy.h
deleted file mode 100644
index 219e3ff86..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/ScatteringPolicy.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KERNEL_SCATTERING_POLICY_H__
-#define KERNEL_SCATTERING_POLICY_H__
-
-#include<Kripke.h>
-
-
-template<typename T>
-struct ScatteringPolicy{}; // nm, g, gp, mat
-
-
-#ifdef RAJA_COMPILER_ICC
-template<>
-struct ScatteringPolicy<NEST_DGZ_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   RAJA::simd_exec>
-                                  >
-{};
-
-#else
-template<>
-struct ScatteringPolicy<NEST_DGZ_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec, 
-                                                       RAJA::simd_exec>,
-                                        kripke_OMP_Parallel<
-                                          RAJA::Permute<RAJA::PERM_IJKL>
-                                        >
-                                      >
-{};
-#endif
-
-template<>
-struct ScatteringPolicy<NEST_DZG_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<kripke_omp_for_nowait_exec, 
-                                                       RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       RAJA::seq_exec>, 
-                                        RAJA::Permute<RAJA::PERM_ILJK>
-                                      >
-{};
-
-template<>
-struct ScatteringPolicy<NEST_GDZ_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec, 
-                                                       RAJA::seq_exec>,
-                                        kripke_OMP_Parallel<
-                                          RAJA::Permute<RAJA::PERM_JKIL>
-                                        >
-                                      >
-{};
-
-template<>
-struct ScatteringPolicy<NEST_GZD_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec, 
-                                                       RAJA::seq_exec>, 
-                                        RAJA::Permute<RAJA::PERM_JKLI>
-                                      >
-{};
-
-template<>
-struct ScatteringPolicy<NEST_ZDG_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec>, 
-                                        RAJA::Permute<RAJA::PERM_LIJK>
-                                      >
-{};
-
-template<>
-struct ScatteringPolicy<NEST_ZGD_T> : RAJA::NestedPolicy<
-                                        RAJA::ExecList<RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       RAJA::seq_exec, 
-                                                       kripke_omp_for_nowait_exec>, 
-                                        RAJA::Permute<RAJA::PERM_LJKI>
-                                      >
-{};
-
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/SourcePolicy.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/SourcePolicy.h
deleted file mode 100644
index e904840d9..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/SourcePolicy.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KERNEL_SOURCE_POLICY_H__
-#define KERNEL_SOURCE_POLICY_H__
-
-#include<Kripke.h>
-
-// There are really only 2 policies, based on group and zone ordering
-// So we define those here, and assign them to each nesting order
-
-#ifdef RAJA_COMPILER_ICC
-using SourcePolicy_GZ = RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::simd_exec>
-                                  >;
-
-#else
-
-using SourcePolicy_GZ = RAJA::NestedPolicy<
-                          RAJA::ExecList<kripke_omp_collapse_nowait_exec, 
-                                         kripke_omp_collapse_nowait_exec>,
-                          kripke_OMP_Parallel<
-                            RAJA::Permute<RAJA::PERM_IJ>
-			                    >
-			                  >;
-#endif
-using SourcePolicy_ZG = RAJA::NestedPolicy<
-                          RAJA::ExecList<kripke_omp_collapse_nowait_exec, 
-                                         kripke_omp_collapse_nowait_exec>,
-                          kripke_OMP_Parallel<
-                            RAJA::Permute<RAJA::PERM_JI>
-			                    >
-			                  >;
-
-template<typename T>
-struct SourcePolicy {}; // g,mix
-
-template<>
-struct SourcePolicy<NEST_DGZ_T> : SourcePolicy_GZ {};
-
-template<>
-struct SourcePolicy<NEST_DZG_T> : SourcePolicy_ZG {};
-
-template<>
-struct SourcePolicy<NEST_GDZ_T> : SourcePolicy_GZ {};
-
-template<>
-struct SourcePolicy<NEST_GZD_T> : SourcePolicy_GZ {};
-
-template<>
-struct SourcePolicy<NEST_ZDG_T> : SourcePolicy_ZG {};
-
-template<>
-struct SourcePolicy<NEST_ZGD_T> : SourcePolicy_ZG {};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/SweepPolicy.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/SweepPolicy.h
deleted file mode 100644
index 689910be0..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Kernel/SweepPolicy.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KERNEL_SWEEP_POLICY_H__
-#define KERNEL_SWEEP_POLICY_H__
-
-#include<Kripke.h>
-  
-typedef RAJA::IndexSet::ExecPolicy<RAJA::seq_segit, RAJA::simd_exec> sweep_seq_exec;
-#ifdef RAJA_ENABLE_OPENMP
-typedef RAJA::IndexSet::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec> sweep_omp_exec;
-#else
-using sweep_omp_exec = sweep_seq_exec;
-#endif
-
-template<typename T>
-struct SweepPolicy{}; // d, g, z
-
-#ifdef RAJA_COMPILER_ICC
-template<>
-struct SweepPolicy<NEST_DGZ_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   sweep_seq_exec>
-                                  >
-{};
-
-#else
-template<>
-struct SweepPolicy<NEST_DGZ_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<kripke_omp_collapse_nowait_exec, 
-                                                   kripke_omp_collapse_nowait_exec, 
-                                                   sweep_seq_exec>,
-                                    kripke_OMP_Parallel<RAJA::Execute>
-                                 >
-{};
-#endif
-template<>
-struct SweepPolicy<NEST_DZG_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<kripke_omp_for_nowait_exec, 
-                                                   RAJA::seq_exec, 
-                                                   sweep_seq_exec>, 
-                                    RAJA::Permute<RAJA::PERM_IKJ>
-                                 >
-{};
-
-template<>
-struct SweepPolicy<NEST_GDZ_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<kripke_omp_for_nowait_exec, 
-                                                   kripke_omp_for_nowait_exec, 
-                                                   sweep_seq_exec>, 
-                                    RAJA::Permute<RAJA::PERM_JIK>
-                                 >
-{};
-
-template<>
-struct SweepPolicy<NEST_GZD_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   kripke_omp_for_nowait_exec, 
-                                                   sweep_seq_exec>, 
-                                    RAJA::Permute<RAJA::PERM_JKI>
-                                 >
-{};
-
-template<>
-struct SweepPolicy<NEST_ZDG_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   sweep_omp_exec>, 
-                                    RAJA::Permute<RAJA::PERM_KIJ>
-                                 >
-{};
-
-template<>
-struct SweepPolicy<NEST_ZGD_T> : RAJA::NestedPolicy<
-                                    RAJA::ExecList<RAJA::seq_exec, 
-                                                   RAJA::seq_exec, 
-                                                   sweep_omp_exec>, 
-                                    RAJA::Permute<RAJA::PERM_KJI>
-                                 >
-{};
-
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/KernelFunctors.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/KernelFunctors.h
deleted file mode 100644
index 02cfc8f05..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/KernelFunctors.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
- 
-#ifndef KRIPKE_KERNEL_FUNCTORS__
-#define KRIPKE_KERNEL_FUNCTORS__
-
-
-template<typename PHI, typename ELL, typename PSI>
-struct LTimesFcn {
-
-  int group0;
-  PHI phi;
-  ELL ell;
-  PSI psi;
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  LTimesFcn(PHI const &phi_, ELL const &ell_, PSI const &psi_, int g0) : 
-    phi(phi_), ell(ell_), psi(psi_), group0(g0)
-  {}
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE 
-  void operator()(IMoment nm, IDirection d, IGroup g, IZone z) const {
-
-    IGlobalGroup g_global( (*g) + group0);
-
-    phi(nm, g_global, z) += ell(d, nm) * psi(d, g, z);
-  }
-
-};
-
-
-
-template<typename PSI, typename ELL_PLUS, typename PHI>
-struct LPlusTimesFcn {
-
-  int      group0;
-  PSI      rhs;
-  ELL_PLUS ell_plus;
-  PHI      phi_out;
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  LPlusTimesFcn(PSI const &rhs_, ELL_PLUS const &ell_plus_, PHI const &phi_out_, int g0) :
-    rhs(rhs_), ell_plus(ell_plus_), phi_out(phi_out_), group0(g0)
-  {}
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE 
-  void operator()(IMoment nm, IDirection d, IGroup g, IZone z) const {
-
-    IGlobalGroup g_global( (*g) + group0);
-
-    rhs(d, g, z) += ell_plus(d, nm) * phi_out(nm, g_global, z);
-  }
-
-};
-
-
-
-template<typename PHI, typename SIGS, typename MZ, typename MM, typename MF, typename MC>
-struct ScatteringFcn {
-
-  PHI  phi;
-  PHI  phi_out;
-  SIGS sigs;
-
-  MZ   mixed_to_zones;
-  MM   mixed_material;
-  MF   mixed_fraction;
-  MC   moment_to_coeff;
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  ScatteringFcn(PHI phi_, PHI phi_out_, SIGS sigs_, MZ mz, MM mm, MF mf, MC mc) :
-    phi(phi_), phi_out(phi_out_), sigs(sigs_),
-    mixed_to_zones(mz),
-    mixed_material(mm),
-    mixed_fraction(mf),
-    moment_to_coeff(mc)
-  {}
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  void operator()(IMoment nm, IGlobalGroup g, IGlobalGroup gp, IMix mix) const {
-
-    ILegendre n = moment_to_coeff(nm);
-    IZone zone = mixed_to_zones(mix);
-    IMaterial material = mixed_material(mix);
-    double fraction = mixed_fraction(mix);
-
-    phi_out(nm, gp, zone) +=
-      sigs(n, g, gp, material) * phi(nm, g, zone) * fraction;
-  }
-
-};
-
-
-
-
-template<typename PHI, typename MZ, typename MM, typename MF>
-struct SourceFcn {
-
-  PHI  phi_out;
-
-  MZ   mixed_to_zones;
-  MM   mixed_material;
-  MF   mixed_fraction;
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  SourceFcn(PHI phi_out_, MZ mz, MM mm, MF mf) :
-    phi_out(phi_out_), 
-    mixed_to_zones(mz),
-    mixed_material(mm),
-    mixed_fraction(mf)
-  {}
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  void operator()(IGlobalGroup g, IMix mix) const {
-    IZone zone = mixed_to_zones(mix);
-    IMaterial material = mixed_material(mix);
-    double fraction = mixed_fraction(mix);
-
-    if(*material == 0){
-      phi_out(IMoment(0), g, zone) += 1.0 * fraction;
-    }
-  }
-  
-};
-
-template<typename DIR, typename PSI, typename SIGT, typename DX, typename DY, typename DZ,
-  typename ZONE_LAYOUT, typename FACEI, typename FACEJ, typename FACEK,
-  typename IDXI, typename IDXJ, typename IDXK>
-struct SweepFcn {
-
-  DIR direction;
-  PSI rhs;
-  PSI psi;
-  SIGT sigt;
-  DX dx;
-  DY dy;
-  DZ dz;
-  ZONE_LAYOUT zone_layout;
-  FACEI face_lf;
-  FACEJ face_fr;
-  FACEK face_bo;
-  IDXI idx_to_i;
-  IDXJ idx_to_j;
-  IDXK idx_to_k;
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  SweepFcn(DIR direction_, PSI rhs_, PSI psi_, SIGT sigt_, DX dx_, DY dy_, DZ dz_,
-      ZONE_LAYOUT zone_layout_, FACEI face_lf_, FACEJ face_fr_, FACEK face_bo_,
-      IDXI idx_to_i_, IDXJ idx_to_j_, IDXK idx_to_k_) :
-    direction(direction_),
-    rhs(rhs_),
-    psi(psi_),
-    sigt(sigt_),
-    dx(dx_),
-    dy(dy_),
-    dz(dz_),
-    zone_layout(zone_layout_),
-    face_lf(face_lf_),
-    face_fr(face_fr_),
-    face_bo(face_bo_),
-    idx_to_i(idx_to_i_),
-    idx_to_j(idx_to_j_),
-    idx_to_k(idx_to_k_)
-  {}
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  void operator()(IDirection d, IGroup g, IZoneIdx zone_idx) const {
-    IZoneI i = idx_to_i(zone_idx);
-    IZoneJ j = idx_to_j(zone_idx);
-    IZoneK k = idx_to_k(zone_idx);
-
-    double const xcos_dxi = 2.0 * direction(d).xcos / dx(i+1);
-    double const ycos_dyj = 2.0 * direction(d).ycos / dy(j+1);
-    double const zcos_dzk = 2.0 * direction(d).zcos / dz(k+1);
-
-    IZone z = zone_layout(i,j,k);
-
-    // Calculate new zonal flux
-    double const psi_d_g_z = (
-          rhs(d,g,z)
-        + face_lf(d,g,j,k) * xcos_dxi
-        + face_fr(d,g,i,k) * ycos_dyj
-        + face_bo(d,g,i,j) * zcos_dzk)
-        / (xcos_dxi + ycos_dyj + zcos_dzk + sigt(g,z) );
-
-    psi(d,g,z) = psi_d_g_z;
-
-    // Apply diamond-difference relationships
-    face_lf(d,g,j,k) = 2.0 * psi_d_g_z - face_lf(d,g,j,k);
-    face_fr(d,g,i,k) = 2.0 * psi_d_g_z - face_fr(d,g,i,k);
-    face_bo(d,g,i,j) = 2.0 * psi_d_g_z - face_bo(d,g,i,j);
-  }
-};
-
-
-template<typename REDUCE, typename DIR, typename PSI, typename VOL>
-struct ParticleEditFcn {
-
-  REDUCE &part_reduce;
-  DIR direction;
-  PSI psi;
-  VOL volume;
-
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  ParticleEditFcn(REDUCE &reduce_, DIR &direction_, PSI psi_, VOL vol_) :
-    part_reduce(reduce_),
-    direction(direction_),
-    psi(psi_),
-    volume(vol_)
-  {}
-
-
-#pragma nv_exec_check_disable
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  void operator()(IDirection d, IGroup g, IZone z) const {
-    part_reduce += direction(d).w * psi(d,g,z) * volume(z);              
-  }
-  
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Layout.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Layout.cpp
deleted file mode 100644
index b0176393b..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Layout.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Layout.h>
-
-#include<Kripke/Input_Variables.h>
-
-#ifdef KRIPKE_USE_MPI
-#include<mpi.h>
-#endif
-
-namespace {
-  /*
-    The following 2 routines are used to map:
-      1) mpi ranks to/from processors in x,y,z
-      2) zoneset ids to/from zoneset in x,y,z
-  */
-
-  /**
-    Helper routine to take an index, and return a 3-dimensional set of indices,
-    given size of each index dimension.
-  */
-  inline void rankToIndices(int rank, int *indices, int const *sizes){
-    indices[0] = rank / (sizes[1]*sizes[2]);
-    rank = rank % (sizes[1]*sizes[2]);
-    indices[1] = rank / sizes[2];
-    indices[2] = rank % sizes[2];
-  }
-
-  /**
-    Helper routine to take an index, and return a 3-dimensional set of indices,
-    given size of each index dimension.
-  */
-  inline int indicesToRank(int const *indices, int const *sizes){
-    int rank;
-
-    rank =  indices[0]*(sizes[1]*sizes[2]);
-    rank += indices[1]*sizes[2];
-    rank += indices[2];
-
-    return rank;
-  }
-}
-
-Layout::Layout(Input_Variables *input_vars){
-  num_group_sets = input_vars->num_groupsets;
-  num_direction_sets = input_vars->num_dirsets;
-  num_zone_sets = 1;
-  for(int dim = 0;dim < 3;++ dim){
-    num_zone_sets_dim[dim] = input_vars->num_zonesets_dim[dim];
-    num_zone_sets *= input_vars->num_zonesets_dim[dim];
-  }
-
-  // grab total number of zones
-  total_zones[0] = input_vars->nx;
-  total_zones[1] = input_vars->ny;
-  total_zones[2] = input_vars->nz;
-
-  // Grab size of processor grid
-  num_procs[0] = input_vars->npx;
-  num_procs[1] = input_vars->npy;
-  num_procs[2] = input_vars->npz;
-
-  /* Set the requested processor grid size */
-  int R = num_procs[0] * num_procs[1] * num_procs[2];
-
-  /* Check requested size is the same as MPI_COMM_WORLD */
-  int size=1;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-#endif
-  if(R != size){
-    int myid=0;
-#ifdef KRIPKE_USE_MPI
-    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-#endif
-    if(myid == 0){
-      KripkeAbort("ERROR: Incorrect number of MPI tasks. Need %d MPI tasks.", R);
-    }
-  }
-
-  /* Compute the local coordinates in the processor decomposition */
-  int mpi_rank = 0;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-#endif
-  rankToIndices(mpi_rank, our_rank, num_procs);
-}
-Layout::~Layout(){
-
-}
-
-/**
-  Computes the subdomain ID based on a given groupset, directionset, and zoneset.
-*/
-int Layout::setIdToSubdomainId(int gs, int ds, int zs) const{
-  int indices[3] = {gs, ds, zs};
-  int sizes[3] = {num_group_sets, num_direction_sets, num_zone_sets};
-
-  return indicesToRank(indices, sizes);
-}
-
-/**
-  Computes groupset, directionset, and zoneset from a subdomain ID.
-*/
-void Layout::subdomainIdToSetId(int sdom_id, int &gs, int &ds, int &zs) const {
-  int indices[3];
-  int sizes[3] = {num_group_sets, num_direction_sets, num_zone_sets};
-
-  rankToIndices(sdom_id, indices, sizes);
-
-  gs = indices[0];
-  ds = indices[1];
-  zs = indices[2];
-}
-
-/**
-  Computes the zoneset id along a particular dimension.
-*/
-int Layout::subdomainIdToZoneSetDim(int sdom_id, int dim) const{
-  // Compute zoneset
-  int gs, ds, zs;
-  subdomainIdToSetId(sdom_id, gs, ds, zs);
-
-  // Compute zone set
-  int zs_dim[3];
-  rankToIndices(zs, zs_dim, num_zone_sets_dim);
-
-  return zs_dim[dim];
-}
-
-/**
-  Computes the number of zones in this subdomain, along specified dimension.
-*/
-int Layout::getNumZones(int sdom_id, int dim) const{
-
-  // get the zoneset index along the specified dimension
-  int zs_dim = subdomainIdToZoneSetDim(sdom_id, dim);
-
-  int total_subdomains = num_procs[dim] * num_zone_sets_dim[dim];
-  int global_subdomain  = num_zone_sets_dim[dim] * our_rank[dim] + zs_dim;
-
-  // Compute subset of global zone indices
-  int num_zones = total_zones[dim] / total_subdomains;
-  int rem = total_zones[dim] % total_subdomains;
-  if(rem != 0 && global_subdomain < rem){
-    num_zones ++;
-  }
-
-  return num_zones;
-}
-
-
-
-
-
-
-BlockLayout::BlockLayout(Input_Variables *input_vars) :
-  Layout(input_vars)
-{
-
-}
-BlockLayout::~BlockLayout(){
-
-}
-
-Neighbor BlockLayout::getNeighbor(int our_sdom_id, int dim, int dir) const{
-  Neighbor n;
-
-  // get our processor indices, so we can find neighbors
-  int proc[3] = {our_rank[0], our_rank[1], our_rank[2]};
-
-  int gs, ds, zs;
-  subdomainIdToSetId(our_sdom_id, gs, ds, zs);
-
-  // Compute out spatial subdomain indices
-  int zs_dim[3];
-  for(int d = 0;d < 3;++ d){
-    zs_dim[d] = subdomainIdToZoneSetDim(our_sdom_id, d);
-  }
-
-  // Offest along dir,dim to get neighboring indices
-  zs_dim[dim] += dir;
-
-  // Check if the neighbor is remote, and wrap zoneset indices
-  if(zs_dim[dim] >= num_zone_sets_dim[dim]){
-    zs_dim[dim] = 0;
-    proc[dim] += dir;
-  }
-  else if(zs_dim[dim] < 0){
-    zs_dim[dim] = num_zone_sets_dim[dim]-1;
-    proc[dim] += dir;
-  }
-
-  // Compute the mpi rank of the neighbor
-  if(proc[dim] < 0 || proc[dim] >= num_procs[dim]){
-    // we hit a boundary condition
-    n.mpi_rank = -1;
-    n.subdomain_id = -1;
-  }
-  else{
-    // There is a neighbor, so compute its rank
-    n.mpi_rank = indicesToRank(proc, num_procs);
-
-    // Compute neighboring subdomain id
-    zs = indicesToRank(zs_dim, num_zone_sets_dim);
-    n.subdomain_id = setIdToSubdomainId(gs, ds, zs);
-  }
-
-  return n;
-}
-
-/**
-  Compute the spatial extents of a subdomain along a given dimension.
-*/
-std::pair<double, double> BlockLayout::getSpatialExtents(int sdom_id, int dim) const{
-
-  // Start with global problem dimensions
-  std::pair<double, double> ext_global(-60.0, 60.0);
-  if(dim == 1){
-    ext_global.first = -100.0;
-    ext_global.second = 100.0;
-  }
-
-  // Subdivide by number of processors in specified dimension
-  double dx = (ext_global.second - ext_global.first) / (double)num_procs[dim];
-  std::pair<double, double> ext_proc(
-    ext_global.first + dx*(double)our_rank[dim],
-    ext_global.first + dx*(double)(our_rank[dim] + 1)
-  );
-
-  // get the zoneset index along the specified dimension
-  int zs_dim = subdomainIdToZoneSetDim(sdom_id, dim);
-
-  // Subdivide by number of subdomains in specified dimension
-  double sdx = (ext_proc.second - ext_proc.first) / (double)num_zone_sets_dim[dim];
-  std::pair<double, double> ext_sdom(
-    ext_proc.first + sdx*(double)zs_dim,
-    ext_proc.first + sdx*(double)(zs_dim + 1)
-  );
-
-  return ext_sdom;
-}
-
-
-
-ScatterLayout::ScatterLayout(Input_Variables *input_vars) :
-  Layout(input_vars)
-{
-
-}
-ScatterLayout::~ScatterLayout(){
-
-}
-
-Neighbor ScatterLayout::getNeighbor(int our_sdom_id, int dim, int dir) const{
-  Neighbor n;
-
-  // get our processor indices, so we can find neighbors
-  int proc[3] = {our_rank[0], our_rank[1], our_rank[2]};
-
-  int gs, ds, zs;
-  subdomainIdToSetId(our_sdom_id, gs, ds, zs);
-
-  // Compute our spatial subdomain indices
-  int zs_dim[3];
-  for(int d = 0;d < 3;++ d){
-    zs_dim[d] = subdomainIdToZoneSetDim(our_sdom_id, d);
-  }
-
-  // Offest along dir,dim to get neighboring subdomain indices
-  proc[dim] += dir;
-
-  // Check if we wrapped mpi ranks, and should bump zoneset indices
-  if(proc[dim] >= num_procs[dim]){
-    proc[dim] = 0;
-    zs_dim[dim] += dir;
-  }
-  else if(proc[dim] < 0){
-    proc[dim] = num_procs[dim]-1;
-    zs_dim[dim] += dir;
-  }
-
-  // Compute zone set indices, and detect boundary condition
-  if(zs_dim[dim] < 0 || zs_dim[dim] >= num_zone_sets_dim[dim]){
-    // we hit a boundary condition
-    n.mpi_rank = -1;
-    n.subdomain_id = -1;
-
-  }
-  else{
-    // There is a neighbor, so compute its rank
-    n.mpi_rank = indicesToRank(proc, num_procs);
-
-    // Compute neighboring subdomain id
-    zs = indicesToRank(zs_dim, num_zone_sets_dim);
-    n.subdomain_id = setIdToSubdomainId(gs, ds, zs);
-  }
-
-
-  return n;
-}
-
-/**
-  Compute the spatial extents of a subdomain along a given dimension.
-*/
-std::pair<double, double> ScatterLayout::getSpatialExtents(int sdom_id, int dim) const{
-
-  // Start with global problem dimensions
-  std::pair<double, double> ext_global(-60.0, 60.0);
-  if(dim == 1){
-    ext_global.first = -100.0;
-    ext_global.second = 100.0;
-  }
-
-  // get the zoneset index along the specified dimension
-  int zs_dim = subdomainIdToZoneSetDim(sdom_id, dim);
-
-  // Subdivide by number of subdomains in specified dimension
-  double sdx = (ext_global.second - ext_global.first) / (double)num_zone_sets_dim[dim];
-  std::pair<double, double> ext_sdom(
-    ext_global.first + sdx*(double)zs_dim,
-    ext_global.first + sdx*(double)(zs_dim + 1)
-  );
-
-  // Subdivide by number of processors in specified dimension
-  double dx = (ext_sdom.second - ext_sdom.first) / (double)num_procs[dim];
-  std::pair<double, double> ext_proc(
-    ext_sdom.first + dx*(double)our_rank[dim],
-    ext_sdom.first + dx*(double)(our_rank[dim] + 1)
-  );
-
-
-  return ext_proc;
-}
-
-
-/**
-  Factory to create Layout object based on user defined inputs
-*/
-Layout *createLayout(Input_Variables *input_vars){
-  switch(input_vars->layout_pattern){
-    case 0:
-      return new BlockLayout(input_vars);
-    case 1:
-      return new ScatterLayout(input_vars);
-  }
-  KripkeAbort("Unknown Layout patter\n");
-  return NULL;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Layout.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Layout.h
deleted file mode 100644
index 1794c6970..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Layout.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_LAYOUT_H__
-#define KRIPKE_LAYOUT_H__
-
-#include<algorithm>
-
-// foreward decl
-struct Input_Variables;
-
-/**
-  Describes a neighboring Subdomain using both mpi-rank and subdomin id
-*/
-struct Neighbor{
-  int mpi_rank;     // Neighbors MPI rank, or -1 for boundary condition
-  int subdomain_id; // Subdomain ID of neighbor
-};
-
-
-
-/**
-   Describes relationships between MPI-ranks and subdomains.
-   This is an interface, allowing different layout schemes to be implemented as derived types.
- */
-class Layout {
-  public:
-    explicit Layout(Input_Variables *input_vars);
-    virtual ~Layout();
-
-    virtual int setIdToSubdomainId(int gs, int ds, int zs) const;
-    virtual int subdomainIdToZoneSetDim(int sdom_id, int dim) const;
-    virtual void subdomainIdToSetId(int sdom_id, int &gs, int &ds, int &zs) const;
-    virtual Neighbor getNeighbor(int our_sdom_id, int dim, int dir) const = 0;
-    virtual std::pair<double, double> getSpatialExtents(int sdom_id, int dim) const = 0;
-    virtual int getNumZones(int sdom_id, int dim) const;
-
-  protected:
-    int num_group_sets;      // Number of group sets
-    int num_direction_sets;  // Number of direction sets
-    int num_zone_sets;       // Number of zone sets
-    int num_zone_sets_dim[3];// Number of zone sets in each dimension
-
-    int total_zones[3];      // Total number of zones in each dimension
-
-    int num_procs[3];        // Number of MPI ranks in each dimensions
-    int our_rank[3];         // Our mpi indices in xyz
-};
-
-class BlockLayout : public Layout {
-  public:
-    explicit BlockLayout(Input_Variables *input_vars);
-    virtual ~BlockLayout();
-
-    virtual Neighbor getNeighbor(int our_sdom_id, int dim, int dir) const;
-    virtual std::pair<double, double> getSpatialExtents(int sdom_id, int dim) const;
-};
-
-class ScatterLayout : public Layout {
-  public:
-    explicit ScatterLayout(Input_Variables *input_vars);
-    virtual ~ScatterLayout();
-
-    virtual Neighbor getNeighbor(int our_sdom_id, int dim, int dir) const;
-    virtual std::pair<double, double> getSpatialExtents(int sdom_id, int dim) const;
-};
-
-
-// Factory to create layout object
-Layout *createLayout(Input_Variables *input_vars);
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm.cpp
deleted file mode 100644
index dbc71af1a..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm.cpp
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke.h>
-#include <Kripke/ParallelComm.h>
-#include <Kripke/Grid.h>
-#include <Kripke/Subdomain.h>
-#include <Kripke/SubTVec.h>
-
-
-ParallelComm::ParallelComm(Grid_Data *grid_data_ptr) :
-  grid_data(grid_data_ptr)
-{
-
-}
-
-ParallelComm::~ParallelComm(){
-
-}
-
-int ParallelComm::computeTag(int mpi_rank, int sdom_id){
-  int mpi_size=1;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-#endif
-
-  int tag = mpi_rank + mpi_size*sdom_id;
-
-  return tag;
-}
-
-void ParallelComm::computeRankSdom(int tag, int &mpi_rank, int &sdom_id){
-  int mpi_size=1;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-#endif
-
-  mpi_rank = tag % mpi_size;
-  sdom_id = tag / mpi_size;
-}
-
-/**
-  Finds subdomain in the queue by its subdomain id.
-*/
-int ParallelComm::findSubdomain(int sdom_id){
-
-  // find subdomain in queue
-  int index;
-  for(index = 0;index < queue_sdom_ids.size();++ index){
-    if(queue_sdom_ids[index] == sdom_id){
-      break;
-    }
-  }
-  if(index == queue_sdom_ids.size()){
-    KripkeAbort("Cannot find subdomain id %d in work queue\n", sdom_id);
-  }
-
-  return index;
-}
-
-
-Subdomain *ParallelComm::dequeueSubdomain(int sdom_id){
-  int index = findSubdomain(sdom_id);
-
-  // Get subdomain pointer before removing it from queue
-  Subdomain *sdom = queue_subdomains[index];
-
-  // remove subdomain from queue
-  queue_sdom_ids.erase(queue_sdom_ids.begin()+index);
-  queue_subdomains.erase(queue_subdomains.begin()+index);
-  queue_depends.erase(queue_depends.begin()+index);
-
-  return sdom;
-}
-
-/**
-  Adds a subdomain to the work queue.
-  Determines if upwind dependencies require communication, and posts appropirate Irecv's.
-  All recieves use the plane_data[] arrays as recieve buffers.
-*/
-void ParallelComm::postRecvs(int sdom_id, Subdomain &sdom){
-  int mpi_rank=0;
-#ifdef KRIPKE_USE_MPI
-  int mpi_size=1;
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-#endif
-
-  // go thru each dimensions upwind neighbors, and add the dependencies
-  int num_depends = 0;
-  for(int dim = 0;dim < 3;++ dim){
-    // If it's a boundary condition, skip it
-    if(sdom.upwind[dim].mpi_rank < 0){
-      continue;
-    }
-
-    // If it's an on-rank communication (from another subdomain)
-    if(sdom.upwind[dim].mpi_rank == mpi_rank){
-      // skip it, but track the dependency
-      num_depends ++;
-      continue;
-    }
-
-#ifdef KRIPKE_USE_MPI
-    // Add request to pending list
-    recv_requests.push_back(MPI_Request());
-    recv_subdomains.push_back(sdom_id);
-
-    // compute the tag id of THIS subdomain (tags are always based on destination)
-    int tag = computeTag(sdom.upwind[dim].mpi_rank, sdom.upwind[dim].subdomain_id);
-
-    // Post the recieve
-    MPI_Irecv(sdom.plane_data[dim]->ptr(), sdom.plane_data[dim]->elements, MPI_DOUBLE, sdom.upwind[dim].mpi_rank,
-      tag, MPI_COMM_WORLD, &recv_requests[recv_requests.size()-1]);
-
-    // increment number of dependencies
-    num_depends ++;
-#endif
-  }
-
-  // add subdomain to queue
-  queue_sdom_ids.push_back(sdom_id);
-  queue_subdomains.push_back(&sdom);
-  queue_depends.push_back(num_depends);
-}
-
-void ParallelComm::postSends(Subdomain *sdom, double *src_buffers[3]){
-  // post sends for downwind dependencies
-  int mpi_rank=0;
-#ifdef KRIPKE_USE_MPI
-  int mpi_size=1;
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-#endif
-  for(int dim = 0;dim < 3;++ dim){
-    // If it's a boundary condition, skip it
-    if(sdom->downwind[dim].mpi_rank < 0){
-      continue;
-    }
-
-    // If it's an on-rank communication (to another subdomain)
-    if(sdom->downwind[dim].mpi_rank == mpi_rank){
-      // find the local subdomain in the queue, and decrement the counter
-      for(int i = 0;i < queue_sdom_ids.size();++ i){
-        if(queue_sdom_ids[i] == sdom->downwind[dim].subdomain_id){
-          queue_depends[i] --;
-          break;
-        }
-      }
-
-      // copy the boundary condition data into the downwinds plane data
-      Subdomain &sdom_downwind = grid_data->subdomains[sdom->downwind[dim].subdomain_id];
-      sdom_downwind.plane_data[dim]->copy(*sdom->plane_data[dim]);
-      int num_elem = sdom_downwind.plane_data[dim]->elements;
-      //double const * KRESTRICT src_ptr = sdom->plane_data[dim]->ptr();
-      double * KRESTRICT dst_ptr = sdom_downwind.plane_data[dim]->ptr();
-      for(int i = 0;i < num_elem;++ i){
-        dst_ptr[i] = src_buffers[dim][i];
-      }
-      continue;
-    }
-#ifdef KRIPKE_USE_MPI
-    // At this point, we know that we have to send an MPI message
-    // Add request to send queue
-    send_requests.push_back(MPI_Request());
-
-    // compute the tag id of TARGET subdomain (tags are always based on destination)
-    int tag = computeTag(mpi_rank, sdom->downwind[dim].subdomain_id);
-
-    // Post the send
-    MPI_Isend(src_buffers[dim], sdom->plane_data[dim]->elements, MPI_DOUBLE, sdom->downwind[dim].mpi_rank,
-      tag, MPI_COMM_WORLD, &send_requests[send_requests.size()-1]);
-#endif
-  }
-}
-
-
-// Checks if there are any outstanding subdomains to complete
-bool ParallelComm::workRemaining(void){
-#ifdef KRIPKE_USE_MPI
-  return (recv_requests.size() > 0 || queue_subdomains.size() > 0);
-#else
-  return (queue_subdomains.size() > 0);
-#endif
-}
-
-
-// Blocks until all sends have completed, and flushes the send queues
-void ParallelComm::waitAllSends(void){
-#ifdef KRIPKE_USE_MPI
-  // Wait for all remaining sends to complete, then return false
-  int num_sends = send_requests.size();
-  if(num_sends > 0){
-    std::vector<MPI_Status> status(num_sends);
-    MPI_Waitall(num_sends, &send_requests[0], &status[0]);
-    send_requests.clear();
-  }
-#endif
-}
-
-/**
-  Checks for incomming messages, and does relevant bookkeeping.
-*/
-void ParallelComm::testRecieves(void){
-
-#ifdef KRIPKE_USE_MPI
-  // Check for any recv requests that have completed
-  int num_requests = recv_requests.size();
-  bool done = false;
-  while(!done && num_requests > 0){
-    // Create array of status variables
-    std::vector<MPI_Status> recv_status(num_requests);
-
-    // Ask if either one or none of the recvs have completed?
-    int index; // this will be the index of request that completed
-    int complete_flag; // this is set to TRUE if somthing completed
-    MPI_Testany(num_requests, &recv_requests[0], &index, &complete_flag, &recv_status[0]);
-
-    if(complete_flag != 0){
-
-      // get subdomain that this completed for
-      int sdom_id = recv_subdomains[index];
-
-      // remove the request from the list
-      recv_requests.erase(recv_requests.begin()+index);
-      recv_subdomains.erase(recv_subdomains.begin()+index);
-      num_requests --;
-
-      // decrement the dependency count for that subdomain
-      for(int i = 0;i < queue_sdom_ids.size();++ i){
-        if(queue_sdom_ids[i] == sdom_id){
-          queue_depends[i] --;
-          break;
-        }
-      }
-    }
-    else{
-      done = true;
-    }
-  }
-#endif
-}
-
-
-std::vector<int> ParallelComm::getReadyList(void){
-  // build up a list of ready subdomains
-  std::vector<int> ready;
-  for(int i = 0;i < queue_depends.size();++ i){
-    if(queue_depends[i] == 0){
-      ready.push_back(queue_sdom_ids[i]);
-    }
-  }
-  return ready;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm.h
deleted file mode 100644
index 7d7f40254..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_COMM_H__
-#define KRIPKE_COMM_H__
-
-#include<vector>
-
-#ifdef KRIPKE_USE_MPI
-#include<mpi.h>
-#endif
-
-struct Grid_Data;
-struct Subdomain;
-
-class ParallelComm {
-  public:
-    explicit ParallelComm(Grid_Data *grid_data_ptr);
-    virtual ~ParallelComm();
-
-    // Adds a subdomain to the work queue
-    virtual void addSubdomain(int sdom_id, Subdomain &sdom) = 0;
-
-    // Checks if there are any outstanding subdomains to complete
-    // false indicates all work is done, and all sends have completed
-    virtual bool workRemaining(void);
-
-    // Returns a vector of ready subdomains, and clears them from the ready queue
-    virtual std::vector<int> readySubdomains(void) = 0;
-
-    // Marks subdomains as complete, and performs downwind communication
-    virtual void markComplete(int sdom_id) = 0;
-
-  protected:
-    static int computeTag(int mpi_rank, int sdom_id);
-    static void computeRankSdom(int tag, int &mpi_rank, int &sdom_id);
-
-    int findSubdomain(int sdom_id);
-    Subdomain *dequeueSubdomain(int sdom_id);
-    void postRecvs(int sdom_id, Subdomain &sdom);
-    void postSends(Subdomain *sdom, double *buffers[3]);
-    void testRecieves(void);
-    void waitAllSends(void);
-    std::vector<int> getReadyList(void);
-
-
-    Grid_Data *grid_data;
-
-    // These vectors contian the recieve requests
-#ifdef KRIPKE_USE_MPI
-    std::vector<MPI_Request> recv_requests;
-#endif
-    std::vector<int> recv_subdomains;
-
-    // These vectors have the subdomains, and the remaining dependencies
-    std::vector<int> queue_sdom_ids;
-    std::vector<Subdomain *> queue_subdomains;
-    std::vector<int> queue_depends;
-
-    // These vectors have the remaining send requests that are incomplete
-#ifdef KRIPKE_USE_MPI
-    std::vector<MPI_Request> send_requests;
-#endif
-};
-
-
-class SweepComm : public ParallelComm {
-  public:
-    explicit SweepComm(Grid_Data *data);
-    virtual ~SweepComm();
-
-    virtual void addSubdomain(int sdom_id, Subdomain &sdom);
-    virtual bool workRemaining(void);
-    virtual std::vector<int> readySubdomains(void);
-    virtual void markComplete(int sdom_id);
-};
-
-
-class BlockJacobiComm : public ParallelComm {
-  public:
-    explicit BlockJacobiComm(Grid_Data *data);
-    virtual ~BlockJacobiComm();
-
-    void addSubdomain(int sdom_id, Subdomain &sdom);
-    bool workRemaining(void);
-    std::vector<int> readySubdomains(void);
-    void markComplete(int sdom_id);
-
-  private:
-    bool posted_sends;
-};
-
-
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm/BlockJacobiComm.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm/BlockJacobiComm.cpp
deleted file mode 100644
index 8dd48cb86..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm/BlockJacobiComm.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke/ParallelComm.h>
-#include <Kripke/SubTVec.h>
-#include <Kripke/Grid.h>
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <vector>
-#include <stdio.h>
-
-
-BlockJacobiComm::BlockJacobiComm(Grid_Data *data) : ParallelComm(data), posted_sends(false)
-{
-
-}
-
-BlockJacobiComm::~BlockJacobiComm(){
-}
-
-/**
-  Adds a subdomain to the work queue.
-  Determines if upwind dependencies require communication, and posts appropirate Irecv's.
-*/
-void BlockJacobiComm::addSubdomain(int sdom_id, Subdomain &sdom){
-  // Copy old flux data to send buffers
-  for(int dim = 0;dim < 3;++ dim){
-    int nelem = sdom.plane_data[dim]->elements;
-    double const * KRESTRICT src = sdom.plane_data[dim]->ptr();
-    double * KRESTRICT dst = sdom.old_plane_data[dim]->ptr();
-    for(int i = 0;i < nelem;++ i){
-      dst[i] = src[i];
-    }
-  }
-
-  // post recieves
-  postRecvs(sdom_id, sdom);
-
-}
-
-// Checks if there are any outstanding subdomains to complete
-// false indicates all work is done, and all sends have completed
-bool BlockJacobiComm::workRemaining(void){
-  if(!posted_sends){
-    // post sends for all queued subdomains
-    for(int i = 0;i < queue_subdomains.size();++ i){
-      Subdomain *sdom = queue_subdomains[i];
-
-      // Send new downwind info for sweep
-      double *buf[3] = {
-        sdom->old_plane_data[0]->ptr(),
-        sdom->old_plane_data[1]->ptr(),
-        sdom->old_plane_data[2]->ptr()
-      };
-
-      postSends(sdom, buf);
-    }
-    posted_sends = true;
-  }
-  // Since we communicate fluxes before local sweeps, when we are
-  // out of work, there is no further synchronization
-  if(ParallelComm::workRemaining()){
-    return true;
-  }
-  waitAllSends();
-
-  return false;
-}
-
-/**
-  Checks for incomming messages, and returns a list of ready subdomain id's
-*/
-std::vector<int> BlockJacobiComm::readySubdomains(void){
-  testRecieves();
-
-  // return list of any ready subdomains
-  return getReadyList();
-}
-
-
-
-void BlockJacobiComm::markComplete(int sdom_id){
-  // remove subdomain from work queue
-  dequeueSubdomain(sdom_id);
-}
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm/SweepComm.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm/SweepComm.cpp
deleted file mode 100644
index 934276008..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/ParallelComm/SweepComm.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke/ParallelComm.h>
-#include <Kripke/SubTVec.h>
-#include <Kripke/Grid.h>
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <vector>
-#include <stdio.h>
-
-
-SweepComm::SweepComm(Grid_Data *data) : ParallelComm(data)
-{
-
-}
-
-SweepComm::~SweepComm(){
-}
-
-/**
-  Adds a subdomain to the work queue.
-  Determines if upwind dependencies require communication, and posts appropirate Irecv's.
-*/
-void SweepComm::addSubdomain(int sdom_id, Subdomain &sdom){
-  // Post recieves for upwind dependencies, and add to the queue
-  postRecvs(sdom_id, sdom);
-}
-
-
-// Checks if there are any outstanding subdomains to complete
-// false indicates all work is done, and all sends have completed
-bool SweepComm::workRemaining(void){
-  // If there are outstanding subdomains to process, return true
-  if(ParallelComm::workRemaining()){
-    return true;
-  }
-
-  // No more work, so make sure all of our sends have completed
-  // before we continue
-  waitAllSends();
-
-  return false;
-}
-
-
-/**
-  Checks for incomming messages, and returns a list of ready subdomain id's
-*/
-std::vector<int> SweepComm::readySubdomains(void){
-  // check for incomming messages
-  testRecieves();
-
-  // build up a list of ready subdomains
-  return getReadyList();
-}
-
-
-void SweepComm::markComplete(int sdom_id){
-  // Get subdomain pointer and remove from work queue
-  Subdomain *sdom = dequeueSubdomain(sdom_id);
-
-  // Send new downwind info for sweep
-  double *buf[3] = {
-    sdom->plane_data[0]->ptr(),
-    sdom->plane_data[1]->ptr(),
-    sdom->plane_data[2]->ptr()
-  };
-  postSends(sdom, buf);
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/SubTVec.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/SubTVec.h
deleted file mode 100644
index eb3ca9242..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/SubTVec.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_SUBTVEC_H__
-#define KRIPKE_SUBTVEC_H__
-
-#define KRIPKE_ALIGN_DATA
-
-#define KRIPKE_ALIGN 64
-
-#include <Kripke/Kernel.h>
-#include <algorithm>
-#include <vector>
-#include <stdlib.h>
-
-/**
- *  A transport vector (used for Psi and Phi, RHS, etc.)
- *
- *  This provides the inner most three strides of
- *    Psi[GS][DS][G][D][Z]
- *  but in whatever nesting order is specified.
- */
-struct SubTVec {
-private:
-  // disallow
-  SubTVec(SubTVec const &c);
-  SubTVec &operator=(SubTVec const &c);
-
-public:
-  SubTVec(Nesting_Order nesting, int ngrps, int ndir_mom, int nzones):
-    groups(ngrps),
-    directions(ndir_mom),
-    zones(nzones),
-    elements(groups*directions*zones),
-    data_linear(NULL)
-  {
-//#ifdef RAJA_ENABLE_CUDA
-    
-#ifdef KRIPKE_ALIGN_DATA
-    int status = posix_memalign((void**)&data_linear, KRIPKE_ALIGN, sizeof(double)*elements);
-    if(status != 0){
-    	printf("Error allocating data\n");
-    	data_linear = NULL;
-    }
-#else
-    data_linear = (double *) malloc(sizeof(double)*elements);
-#endif // align
-//#endif // cuda
-    setupIndices(nesting, data_linear);
-  }
-
-
-  /**
-   * ALIASING version of constructor.
-   * Use this when you have a data buffer already, and don't want this class
-   * to do any memory management.
-   */
-  SubTVec(Nesting_Order nesting, int ngrps, int ndir_mom, int nzones, double *ptr):
-    groups(ngrps),
-    directions(ndir_mom),
-    zones(nzones),
-    elements(groups*directions*zones),
-    data_linear(NULL)
-  {
-    setupIndices(nesting, ptr);
-  }
-
-  ~SubTVec(){
-    if(data_linear != NULL){
-      free(data_linear);
-    }
-  }
-
-  void setupIndices(Nesting_Order nesting, double *ptr){
-    // setup nesting order
-    switch(nesting){
-      case NEST_GDZ:
-        ext_to_int[0] = 0;
-        ext_to_int[1] = 1;
-        ext_to_int[2] = 2;
-        break;
-      case NEST_GZD:
-        ext_to_int[0] = 0;
-        ext_to_int[2] = 1;
-        ext_to_int[1] = 2;
-        break;
-      case NEST_DZG:
-        ext_to_int[1] = 0;
-        ext_to_int[2] = 1;
-        ext_to_int[0] = 2;
-        break;
-      case NEST_DGZ:
-        ext_to_int[1] = 0;
-        ext_to_int[0] = 1;
-        ext_to_int[2] = 2;
-        break;
-      case NEST_ZDG:
-        ext_to_int[2] = 0;
-        ext_to_int[1] = 1;
-        ext_to_int[0] = 2;
-        break;
-      case NEST_ZGD:
-        ext_to_int[2] = 0;
-        ext_to_int[0] = 1;
-        ext_to_int[1] = 2;
-        break;
-    }
-
-    // setup dimensionality
-    int size_ext[3];
-    size_ext[0] = groups;
-    size_ext[1] = directions;
-    size_ext[2] = zones;
-
-    // map to internal indices
-    for(int i = 0; i < 3; ++i){
-      size_int[ext_to_int[i]] = size_ext[i];
-    }
-
-    data_pointer = ptr;
-  }
-
-  inline double* ptr(void){
-    return data_pointer;
-  }
-
-  inline double* ptr(int g, int d, int z){
-    return &(*this)(g,d,z);
-  }
-
-  // These are NOT efficient.. just used to re-stride data for comparisons
-  inline double &operator()(int g, int d, int z) {
-    int idx[3];
-    idx[ext_to_int[0]] = g;
-    idx[ext_to_int[1]] = d;
-    idx[ext_to_int[2]] = z;
-    int offset = idx[0] * size_int[1]*size_int[2] +
-                 idx[1] * size_int[2] +
-                 idx[2];
-    return data_pointer[offset];
-  }
-  inline double operator()(int g, int d, int z) const {
-    return (*const_cast<SubTVec*>(this))(g,d,z);
-  }
-
-  inline double sum(void) const {
-    double s = 0.0;
-    for(size_t i = 0;i < elements;++ i){
-      s+= data_linear[i];
-    }
-    return s;
-  }
-
-  inline void clear(double v){
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int i = 0;i < elements;++ i){
-      data_linear[i] = v;
-    }
-  }
-
-  inline void randomizeData(void){
-    for(int i = 0;i < elements;++ i){
-      data_linear[i] = drand48();
-    }
-  }
-
-  inline void copy(SubTVec const &b){
-    for(int g = 0;g < groups;++ g){
-      for(int d = 0;d < directions; ++ d){
-        for(int z = 0;z < zones;++ z){
-          // Copy using abstract indexing
-          (*this)(g,d,z) = b(g,d,z);
-        }
-      }
-    }
-  }
-
-  inline bool compare(std::string const &name, SubTVec const &b,
-      double tol, bool verbose){
-
-    bool is_diff = false;
-    int num_wrong = 0;
-    for(int g = 0;g < groups;++ g){
-      for(int d = 0;d < directions; ++ d){
-        for(int z = 0;z < zones;++ z){
-          // Copy using abstract indexing
-          double err = std::abs((*this)(g,d,z) - b(g,d,z));
-          if(err > tol){
-            is_diff = true;
-            if(verbose){
-              printf("%s[g=%d, d=%d, z=%d]: |%e - %e| = %e\n",
-                  name.c_str(), g,d,z, (*this)(g,d,z), b(g,d,z), err);
-              num_wrong ++;
-              if(num_wrong > 100){
-                return true;
-              }
-            }
-          }
-        }
-      }
-    }
-    return is_diff;
-  }
-
-  int ext_to_int[3]; // external index to internal index mapping
-  int size_int[3]; // size of each dimension in internal indices
-
-  int groups, directions, zones, elements;
-  double *data_pointer;
-  double *data_linear;
-};
-
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Subdomain.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Subdomain.cpp
deleted file mode 100644
index c92303c1d..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Subdomain.cpp
+++ /dev/null
@@ -1,598 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke.h>
-#include <Kripke/Grid.h>
-#include <Kripke/SubTVec.h>
-#include <Kripke/Input_Variables.h>
-#include <Kripke/Kernel/DataPolicy.h>
-
-#include <cmath>
-#include <sstream>
-
-
-namespace {
-  /**
-    This function defined the material distribution in space.
-    This defines Problem 3 from Kobayashi
-    Where Region 1 is material 0, 2 is 1 and 3 is 2.
-  */
-  inline int queryMaterial(double x, double y, double z){
-    // Problem is defined for one octant, with reflecting boundaries
-    // We "unreflect" it here by taking abs values
-    x = std::abs(x);
-    y = std::abs(y);
-    z = std::abs(z);
-
-    // Central 20x20x20 box is Region 1
-    if(x <= 10.0 && y <= 10.0 && z <= 10.0){
-      return 0;
-    }
-
-    // Leg 1 of Region 2
-    if(x <= 10.0 && y <= 60.0 && z <= 10.0){
-      return 1;
-    }
-
-    // Leg 2 of Region 2
-    if(x <= 40.0 && y >= 50.0 && y <= 60.0 && z <= 10.0){
-      return 1;
-    }
-
-    // Leg 3 of Region 2
-    if(x >= 30.0 && x <= 40.0 && y >= 50.0 && y <= 60.0 && z <= 40.0){
-      return 1;
-    }
-
-    // Leg 4 of Region 2
-    if(x >= 30.0 && x <= 40.0 && y >= 50.0 && z >= 30.0 && z <= 40.0){
-      return 1;
-    }
-
-    // Rest is filled with region 3
-    return 2;
-  }
-}
-
-
-
-Subdomain::Subdomain() :
-  idx_dir_set(0),
-  idx_group_set(0),
-  idx_zone_set(0),
-  num_groups(0),
-  num_directions(0),
-  num_zones(0),
-  group0(0),
-  direction0(0),
-  psi(NULL),
-  rhs(NULL),
-  sigt(NULL),
-  directions(NULL),
-  ell(NULL),
-  ell_plus(NULL),
-  phi(NULL),
-  phi_out(NULL)
-{
-  for(int dim = 0;dim < 3;++ dim){
-    plane_data[dim] = NULL;
-    old_plane_data[dim] = NULL;
-  }
-}
-Subdomain::~Subdomain(){
-  delete psi;
-  delete rhs;
-  delete sigt;
-  for(int dim = 0;dim < 3;++ dim){
-    delete plane_data[dim];
-    delete old_plane_data[dim];
-  }
-}
-
-
-/**
-  Setup subdomain and allocate data
-*/
-void Subdomain::setup(int sdom_id, Input_Variables *input_vars, int gs, int ds, int zs,
-    std::vector<Directions> &direction_list, Kernel *kernel, Layout *layout)
-{
-  // set the set indices
-  idx_group_set = gs;
-  idx_dir_set = ds;
-  idx_zone_set = zs;
-
-  num_groups = input_vars->num_groups / input_vars->num_groupsets;
-  group0 = gs * num_groups;
-
-  num_directions = input_vars->num_directions / input_vars->num_dirsets;
-  direction0 = ds * num_directions;
-  directions = &direction_list[direction0];
-
-  num_zones = 1;
-  for(int dim = 0;dim < 3;++ dim){
-    // Compute number of zones in this dimension
-    nzones[dim] = layout->getNumZones(sdom_id, dim);
-    num_zones *= nzones[dim];
-
-    // Compute grid deltas in this dimension (including ghost zone deltas)
-    std::pair<double, double> dim_extent = layout->getSpatialExtents(sdom_id, dim);
-    zeros[dim] = dim_extent.first;
-    double dx = (dim_extent.second-dim_extent.first)/(double)nzones[dim];
-    deltas[dim].resize(nzones[dim]+2);
-    for(int z = 0;z < nzones[dim]+2;++ z){
-      deltas[dim][z] = dx;
-    }
-  }
-
-  index_size[IMaterial::getName()] = 3;
-  index_size[ILegendre::getName()] = input_vars->legendre_order+1;
-  index_size[IMoment::getName()] = (input_vars->legendre_order+1)*(input_vars->legendre_order+1);
-  index_size[IDirection::getName()] = num_directions;
-  index_size[IGroup::getName()] = num_groups;
-  index_size[IGlobalGroup::getName()] = input_vars->num_groups;
-  index_size[IZone::getName()] = num_zones;
-  index_size[IZoneIdx::getName()] = num_zones;
-  index_size[IZoneI::getName()] = nzones[0];
-  index_size[IZoneJ::getName()] = nzones[1];
-  index_size[IZoneK::getName()] = nzones[2];
-
-  // allocate storage for the sweep boundary data (upwind and downwind share same buffer)
-  plane_data[0] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[1] * nzones[2]);
-  plane_data[1] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[0] * nzones[2]);
-  plane_data[2] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[0] * nzones[1]);
-
-  // For block-jacobi parallel method
-  old_plane_data[0] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[1] * nzones[2]);
-  old_plane_data[1] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[0] * nzones[2]);
-  old_plane_data[2] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[0] * nzones[1]);
-
-  // allocate the storage for solution and source terms
-  psi = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, num_zones);
-  psi->clear(0.0);
-  rhs = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, num_zones);
-  sigt = new SubTVec(kernel->nestingSigt(), num_groups, 1, num_zones);
-  sigt->clear(0.0);
-
-  computeSweepIndexSet();
-
-  // Setup neighbor data
-  int dirs[3] = { directions[0].id, directions[0].jd, directions[0].kd};
-  for(int dim = 0;dim < 3;++ dim){
-    downwind[dim] = layout->getNeighbor(sdom_id, dim, dirs[dim]);
-    upwind[dim] = layout->getNeighbor(sdom_id, dim, -1 * dirs[dim]);
-  }
-
-  // paint the mesh
-  reg_volume[0] = 0.0;
-  reg_volume[1] = 0.0;
-  reg_volume[2] = 0.0;
-  int num_subsamples = 4; // number of subsamples per spatial dimension
-  double sample_vol_frac = 1.0 / (double)(num_subsamples*num_subsamples*num_subsamples);
-  int zone_id = 0;
-  double pz = zeros[2];
-
-  for (int k = 0; k < nzones[2]; k++) {
-    double sdz = deltas[2][k+1] / (double)(num_subsamples+1);
-    double py = zeros[1];
-
-    for (int j = 0; j != nzones[1]; j ++) {
-      double sdy = deltas[1][j+1] / (double)(num_subsamples+1);
-      double px = zeros[0];
-
-      for (int i = 0; i != nzones[0]; i ++) {
-        double sdx = deltas[0][i+1] / (double)(num_subsamples+1);
-
-        double zone_volume = deltas[0][i+1] * deltas[1][j+1] * deltas[2][k+1];
-        volume.push_back(zone_volume);
-
-        // subsample probe the geometry to get our materials
-        double frac[3] = {0.0, 0.0, 0.0}; // fraction of both materials
-        double spz = pz + sdz;
-
-        for(int sk = 0;sk < num_subsamples;++ sk){
-          double spy = py + sdy;
-          for(int sj = 0;sj < num_subsamples;++ sj){
-            double spx = px + sdx;
-            for(int si = 0;si < num_subsamples;++ si){
-
-              int mat = queryMaterial(spx, spy, spz);
-              frac[mat] += sample_vol_frac;
-
-              spx += sdx;
-            }
-            spy += sdy;
-          }
-          spz += sdz;
-        }
-
-        // Add material to zone
-        int nmixed = 0;
-        for(int mat = 0;mat < 3;++ mat){          
-          if(frac[mat] > 0.0){
-            nmixed ++;
-            if(nmixed == 1){
-              zones_to_mixed.push_back(mixed_to_zones.size());
-            }
-            mixed_to_zones.push_back(zone_id);
-            mixed_material.push_back(mat);
-            mixed_fraction.push_back(frac[mat]);
-            reg_volume[mat] += frac[mat] * zone_volume;
-            
-            // initialize background sigt
-            for(int g = 0;g < num_groups;++ g){
-              (*sigt)(g,0,zone_id) += frac[mat] * input_vars->sigt[mat];
-            }
-          }
-        }
-        num_mixed.push_back(nmixed);
-
-        // increment zone
-        px += deltas[0][i+1];
-        zone_id ++;
-      }
-      py += deltas[1][j+1];
-    }
-    pz += deltas[2][k+1];
-  }
-  
-  // store number of mixed elements
-  index_size[IMix::getName()] = mixed_to_zones.size();
-}
-
-void Subdomain::setVars(SubTVec *ell_ptr, SubTVec *ell_plus_ptr,
-    SubTVec *phi_ptr, SubTVec *phi_out_ptr){
-
-  ell = ell_ptr;
-  ell_plus = ell_plus_ptr;
-  phi = phi_ptr;
-  phi_out = phi_out_ptr;
-}
-
-
-/**
- * Randomizes data for a set.
- */
-void Subdomain::randomizeData(void){
-  psi->randomizeData();
-  rhs->randomizeData();
-  sigt->randomizeData();
-
-  for(int d = 0;d < 3;++ d){
-    for(int i = 0;i < deltas[d].size();++ i){
-      deltas[d][i] = drand48();
-    }
-  }
-}
-
-/**
- * Copies two sets, allowing for different nestings.
- */
-void Subdomain::copy(Subdomain const &b){
-  psi->copy(*b.psi);
-  rhs->copy(*b.rhs);
-  sigt->copy(*b.sigt);
-
-  for(int d = 0;d < 3;++ d){
-    deltas[d] = b.deltas[d];
-  }
-}
-
-/**
- * Compares two sets, allowing for different nestings.
- */
-bool Subdomain::compare(Subdomain const &b, double tol, bool verbose){
-  std::stringstream namess;
-  namess << "gdset[gs=" << idx_group_set << ", ds=" << idx_dir_set << ", zs=" << idx_zone_set << "]";
-  std::string name = namess.str();
-
-  bool is_diff = false;
-  is_diff |= psi->compare(name+".psi", *b.psi, tol, verbose);
-  is_diff |= rhs->compare(name+".rhs", *b.rhs, tol, verbose);
-  is_diff |= sigt->compare(name+".sigt", *b.sigt, tol, verbose);
-
-  is_diff |= compareVector(name+".deltas[0]", deltas[0], b.deltas[0], tol, verbose);
-  is_diff |= compareVector(name+".deltas[1]", deltas[1], b.deltas[1], tol, verbose);
-  is_diff |= compareVector(name+".deltas[2]", deltas[2], b.deltas[2], tol, verbose);
-
-  return is_diff;
-}
-
-/**
- * Compute sweep index sets.
- * Determines logical indices, and increments for i,j,k based on grid
- * information and quadrature set sweeping direction.
- */
-void Subdomain::computeSweepIndexSet(void){
-  if(directions[0].id > 0){
-    sweep_block.start_i = 0;
-    sweep_block.end_i = nzones[0];
-    sweep_block.inc_i = 1;
-  }
-  else {
-    sweep_block.start_i = nzones[0]-1;
-    sweep_block.end_i = -1;
-    sweep_block.inc_i = -1;
-  }
-
-  if(directions[0].jd > 0){
-    sweep_block.start_j = 0;
-    sweep_block.end_j = nzones[1];
-    sweep_block.inc_j = 1;
-  }
-  else {
-    sweep_block.start_j = nzones[1]-1;
-    sweep_block.end_j = -1;
-    sweep_block.inc_j = -1;
-  }
-
-  if(directions[0].kd > 0){
-    sweep_block.start_k = 0;
-    sweep_block.end_k = nzones[2];
-    sweep_block.inc_k =  1;
-  }
-  else {
-    sweep_block.start_k = nzones[2]-1;
-    sweep_block.end_k = -1;
-    sweep_block.inc_k = -1;
-  }
-
-  sweep_block.indexset_sweep = RAJA::IndexSet();
-  int N = nzones[0];
-  if (nzones[1] > N) N=nzones[1];
-  if (nzones[2] > N) N=nzones[2];
-
-  int i_inc = sweep_block.inc_i;
-  int j_inc = sweep_block.inc_j;
-  int k_inc = sweep_block.inc_k;
-  int i_min, i_max, j_min, j_max, k_min, k_max;
-  int counter = 0;
-  int Nslices = 0;
-  int offset[3*N+4];
-  offset[0] = 0;
-
-  if ( i_inc == 1){
-    i_min = sweep_block.start_i;
-    i_max = sweep_block.end_i-1;
-  }
-  else{
-    i_min = sweep_block.end_i+1;
-    i_max = sweep_block.start_i;
-  }
-  if ( j_inc == 1){
-    j_min = sweep_block.start_j;
-    j_max = sweep_block.end_j-1;
-  }
-  else{
-    j_min = sweep_block.end_j+1;
-    j_max = sweep_block.start_j;
-  }
-  if ( k_inc == 1){
-    k_min = sweep_block.start_k;
-    k_max = sweep_block.end_k-1;
-  }
-  else{
-    k_min = sweep_block.end_k+1;
-    k_max = sweep_block.start_k;
-  }
-  int ii_tmp = (1 - i_inc)/2*i_max;
-  int jj_tmp = (1 - j_inc)/2*j_max;
-  int kk_tmp = (1 - k_inc)/2*k_max;
-
-
-  //sweep_block.ii_jj_kk_z_idx = new int[nzones[0]*nzones[1]*nzones[2]*4];
-  sweep_block.idx_to_i = new int[nzones[0]*nzones[1]*nzones[2]];
-  sweep_block.idx_to_j = new int[nzones[0]*nzones[1]*nzones[2]];
-  sweep_block.idx_to_k = new int[nzones[0]*nzones[1]*nzones[2]];
-  sweep_block.idx_to_z = new int[nzones[0]*nzones[1]*nzones[2]];
-  
-  if(false){ // try a dummy sweep pattern.. this will give you the WRONG answer.. just to understand memory performance
-    int z = 0;
-    int zstart = z;
-    for(int k = 0;k < nzones[2];++k){          
-      for(int j = 0;j < nzones[1];++j){        
-        for(int i = 0;i < nzones[0];++i){ 
-          sweep_block.idx_to_i[z] = i;
-          sweep_block.idx_to_j[z] = j;
-          sweep_block.idx_to_k[z] = k;
-          sweep_block.idx_to_z[z] = z;
-          ++ z;
-        }        
-      }
-      
-    }
-    sweep_block.indexset_sweep.push_back(
-            RAJA::RangeSegment(zstart, z)); 
-  }
-  else{
-    for (int C = 0; C <=(3*N); ++C){   //for each C we can touch zone["i","j","k"]  as well as "d" and "group"    in parallel
-     int FLAG=0;
-     for (int i = 0; i <= C; ++i){
-       for (int j = 0; j <= C; ++j){
-          int k = C - i - j; // surface equation i+j+j=C
-          //flip if needed
-
-          int ii = ii_tmp + i*i_inc;
-          int jj = jj_tmp + j*j_inc;
-          int kk = kk_tmp + k*k_inc;
-
-          if (ii <= i_max && jj <= j_max && kk <= k_max && ii >= i_min && jj >= j_min && kk >= k_min){
-
-            sweep_block.idx_to_i[counter] = ii;
-            sweep_block.idx_to_j[counter] = jj;
-            sweep_block.idx_to_k[counter] = kk;
-            sweep_block.idx_to_z[counter] = ii + nzones[0]*jj + nzones[0]*nzones[1]*kk;//  Zonal_INDEX(ii, jj, kk);
-            counter++; //counts all elements
-            FLAG++;   //counts elements per slice
-         }
-       }
-     }
-     if (FLAG){
-        Nslices++;
-        offset[Nslices] = offset[Nslices-1] + FLAG;
-
-        // an index set which describes each hyperplane as a RangeSegment
-        sweep_block.indexset_sweep.push_back(
-            RAJA::RangeSegment(offset[Nslices-1], offset[Nslices]));
-     }
-   }
-   } // hyper-plane generator
-}
-
-namespace {
-  double FactFcn(int n)
-  {
-    double fact = 1.0;
-    for(int i = n;i > 0 ;--i){
-      fact *= (double)i;
-    }
-    return(fact);
-  }
-
-  inline double PnmFcn(int n, int m, double x)
-  {
-    /*-----------------------------------------------------------------
-     * It is assumed that 0 <= m <= n and that abs(x) <= 1.0.
-     * No error checking is done, however.
-     *---------------------------------------------------------------*/
-    double fact, pnn, pmm, pmmp1, somx2;
-
-    int i, nn;
-
-    if(std::abs(x) > 1.0){
-      KripkeAbort("Bad input to ardra_PnmFcn: abs(x) > 1.0, x = %e\n", x);
-    }
-    else if((x > 1.0) && (x <= 1.0)){
-      x = 1.0;
-    }
-    else if((-1.0 <= x ) && (x < -1.0)){
-      x = -1.0;
-    }
-
-    pmm=1.0;
-    if(m > 0){
-      somx2=sqrt((1.0-x)*(1.0+x));
-      fact=1.0;
-      for(i=1; i<=m; i++){
-        pmm *= -fact*somx2;
-        fact += 2.0;
-      }
-    }
-    if(n == m){
-      return(pmm);
-    }
-    else {
-      pmmp1=x*(2*m+1)*pmm;
-      if(n == (m+1)){
-        return(pmmp1);
-      }
-      else {
-        for(nn=m+2; nn<=n; nn++){
-          pnn=(x*(2*nn-1)*pmmp1-(nn+m-1)*pmm)/(nn-m);
-          pmm=pmmp1;
-          pmmp1=pnn;
-        }
-        return(pnn);
-      }
-    }
-  }
-
-  inline double YnmFcn(int n, int m, double mu, double eta, double xi)
-  {
-    double fac1, fac2, anm, ynm, pnm, dm0, taum, tmp, phi, phi_tmp;
-    double floor=1.e-20;
-    int nn, mm;
-
-    /* Calculate the correct phi for omega=(mu,eta,xi) */
-    tmp = fabs(eta/(mu+floor));
-    phi_tmp = atan(tmp);
-    if( (mu>0) && (eta>0) ){
-      phi = phi_tmp;
-    }
-    else if( (mu<0) && (eta>0) ){
-      phi = M_PI - fabs(phi_tmp);
-    }
-    else if( (mu<0) && (eta<0) ){
-      phi = M_PI + fabs(phi_tmp);
-    }
-    else {
-      phi = 2.0*M_PI - fabs(phi_tmp);
-    }
-
-    /* Begin evaluation of Ynm(omega) */
-    nn = n - std::abs(m);
-    fac1 = (double) FactFcn(nn);
-    nn = n + std::abs(m);
-    fac2 = (double) FactFcn(nn);
-    mm = std::abs(m);
-    pnm = PnmFcn(n, mm, xi);
-    tmp = ((double) m)*phi;
-    if(m >= 0){
-      taum = cos(tmp);
-    }
-    else {taum = sin(-tmp); }
-    if(m == 0){
-      dm0 = 1.0;
-    }
-    else {dm0 = 0.0; }
-    tmp = ((2*n+1)*fac1)/(2.0*(1.0+dm0)*M_PI*fac2);
-    anm = sqrt( tmp );
-    ynm = anm*pnm*taum;
-    return(ynm);
-  }
-}
-
-/**
- * Compute L and L+
- * This assumes that the quadrature set is defined.
- */
-void Subdomain::computeLLPlus(int legendre_order){
-  double SQRT4PI = std::sqrt(4*M_PI);
-  for(int n=0, nm=0; n < legendre_order+1; n++){
-    for(int m=-n; m<=n; m++){
-      for(int d=0; d<num_directions; d++){
-        // Get quadrature point info
-        double xcos = (directions[d].id)*(directions[d].xcos);
-        double ycos = (directions[d].jd)*(directions[d].ycos);
-        double zcos = (directions[d].kd)*(directions[d].zcos);
-        double w =  directions[d].w;
-
-        double ynm = YnmFcn(n, m, xcos, ycos, zcos);
-
-        // Compute element of L and L+
-        (*ell)(nm,d,0) = w*ynm/SQRT4PI;
-        (*ell_plus)(nm,d,0) = ynm*SQRT4PI;
-      }
-      nm ++;
-    }
-  }
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Subdomain.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Subdomain.h
deleted file mode 100644
index e8cce9f33..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Subdomain.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_SUBDOMAIN_H__
-#define KRIPKE_SUBDOMAIN_H__
-
-#include <vector>
-#include <string>
-#include <map>
-#include <Kripke/Layout.h>
-#include <RAJA/IndexSet.hxx>
-
-// Foreward Decl
-struct Directions;
-struct SubTVec;
-struct Input_Variables;
-class Kernel;
-
-/**
- * Provides sweep index sets for a given octant.
- * This generalizes the sweep pattern, and allows for experimenting with
- * a tiled approach to on-node sweeps.
- */
-struct Grid_Sweep_Block {
-  int start_i, start_j, start_k; // starting index
-  int end_i, end_j, end_k; // termination conditon (one past)
-  int inc_i, inc_j, inc_k; // increment
-
-  // Index set to describe sweep iteration pattern
-  RAJA::IndexSet indexset_sweep;
-
-  // mappings from index set back to zones and i,j,k
-  int *idx_to_i;
-  int *idx_to_j;
-  int *idx_to_k;
-  int *idx_to_z;
-};
-
-
-
-/**
- * Contains parameters and variables that describe a single Group Set and
- * Direction Set.
- */
-struct Subdomain {
-  Subdomain();
-  ~Subdomain();
-
-  void setup(int sdom_id, Input_Variables *input_vars, int gs, int ds, int zs,
-    std::vector<Directions> &direction_list, Kernel *kernel, Layout *layout);
-
-  void setVars(SubTVec *ell_ptr, SubTVec *ell_plus_ptr,
-    SubTVec *phi_ptr, SubTVec *phi_out_ptr);
-
-  void randomizeData(void);
-  void copy(Subdomain const &b);
-  bool compare(Subdomain const &b, double tol, bool verbose);
-  void computeSweepIndexSet(void);
-  void computeLLPlus(int legendre_order);
-
-  int idx_group_set;
-  int idx_dir_set;
-  int idx_zone_set;
-
-  int num_groups;       // Number of groups in this set
-  int num_directions;   // Number of directions in this set
-  int num_zones;        // Number of zones in this set
-
-  double zeros[3];                     // origin of local mesh
-  int nzones[3];                    // Number of zones in each dimension
-  std::vector<double> deltas[3];    // Spatial grid deltas in each dimension (including ghost zones)
-
-  int group0;           // Starting global group id
-  int direction0;       // Starting global direction id
-
-  Grid_Sweep_Block sweep_block;
-
-  // Neighbors
-  Neighbor upwind[3];   // Upwind dependencies in x,y,z
-  Neighbor downwind[3]; // Downwind neighbors in x,y,z
-
-  // Sweep boundary data
-  SubTVec *plane_data[3];
-  SubTVec *old_plane_data[3];
-
-  // Variables
-  SubTVec *psi;         // Solution
-  SubTVec *rhs;         // RHS, source term
-  SubTVec *sigt;        // Zonal per-group cross-section
-
-  // Pointers into directions and directionset data from Grid_Data
-  Directions *directions;
-  SubTVec *ell;
-  SubTVec *ell_plus;
-  SubTVec *phi;
-  SubTVec *phi_out;
-
-  // Materials on the mesh, used for scattering lookup
-  double reg_volume[3];               // volume of each material region
-  std::vector<double> volume;         // volume of each zone
-  std::vector<int> mixed_to_zones;    // mapping from mixed slot to zones
-  std::vector<int> num_mixed;         // mapping from mixed slot to zones
-  std::vector<int> zones_to_mixed;    // mapping from zones to first mixed slot
-  std::vector<int> mixed_material;    // material number for each mixed slot
-  std::vector<double> mixed_fraction; // volume fraction each mixed slot
-  
-  // Index information
-  std::map<std::string, int> index_size; // size of each Index
-  
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Sweep_Solver.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Sweep_Solver.cpp
deleted file mode 100644
index b97075424..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Sweep_Solver.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke.h>
-#include <Kripke/Subdomain.h>
-#include <Kripke/SubTVec.h>
-#include <Kripke/ParallelComm.h>
-#include <Kripke/Grid.h>
-#include <vector>
-#include <stdio.h>
-
-#ifdef KRIPKE_USE_MPI
-#include<mpi.h>
-#endif
-
-/**
-  Run solver iterations.
-*/
-int SweepSolver (Grid_Data *grid_data, bool block_jacobi)
-{
-  Kernel *kernel = grid_data->kernel;
-
-  int mpi_rank = 0;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-#endif
-
-  BLOCK_TIMER(grid_data->timing, Solve);
-
-
-  // Loop over iterations
-  double part_last = 0.0;
-  for(int iter = 0;iter < grid_data->niter;++ iter){
-
-    /*
-     * Compute the RHS:  rhs = LPlus*S*L*psi + Q
-     */
-
-    // Discrete to Moments transformation (phi = L*psi)
-    {
-      BLOCK_TIMER(grid_data->timing, LTimes);
-      kernel->LTimes(grid_data);
-    }
-
-    // Compute Scattering Source Term (psi_out = S*phi)
-    {
-      BLOCK_TIMER(grid_data->timing, Scattering);
-      kernel->scattering(grid_data);
-    }
-
-    // Compute External Source Term (psi_out = psi_out + Q)
-    {
-      BLOCK_TIMER(grid_data->timing, Source);
-      kernel->source(grid_data);
-    }
-
-    // Moments to Discrete transformation (rhs = LPlus*psi_out)
-    {
-      BLOCK_TIMER(grid_data->timing, LPlusTimes);
-      kernel->LPlusTimes(grid_data);
-    }
-
-    /*
-     * Sweep (psi = Hinv*rhs)
-     */
-    {
-      BLOCK_TIMER(grid_data->timing, Sweep);
-
-      if(true){
-        // Create a list of all groups
-        std::vector<int> sdom_list(grid_data->subdomains.size());
-        for(int i = 0;i < grid_data->subdomains.size();++ i){
-          sdom_list[i] = i;
-        }
-
-        // Sweep everything
-        SweepSubdomains(sdom_list, grid_data, block_jacobi);
-      }
-      // This is the ARDRA version, doing each groupset sweep independently
-      else{
-        for(int group_set = 0;group_set < grid_data->num_group_sets;++ group_set){
-          std::vector<int> sdom_list;
-          // Add all subdomains for this groupset
-          for(int s = 0;s < grid_data->subdomains.size();++ s){
-            if(grid_data->subdomains[s].idx_group_set == group_set){
-              sdom_list.push_back(s);
-            }
-          }
-
-          // Sweep the groupset
-          SweepSubdomains(sdom_list, grid_data, block_jacobi);
-        }
-      }
-    }
-
-    {
-      BLOCK_TIMER(grid_data->timing, ParticleEdit);
-      double part = kernel->particleEdit(grid_data);
-      if(mpi_rank==0){
-        printf("iter %d: particle count=%e, change=%e\n", iter, part, (part-part_last)/part);
-      }
-      part_last = part;
-    }
-  }
-  return(0);
-}
-
-
-
-/**
-  Perform full parallel sweep algorithm on subset of subdomains.
-*/
-void SweepSubdomains (std::vector<int> subdomain_list, Grid_Data *grid_data, bool block_jacobi)
-{
-  // Create a new sweep communicator object
-  ParallelComm *comm = NULL;
-  if(block_jacobi){
-    comm = new BlockJacobiComm(grid_data);
-  }
-  else {
-    comm = new SweepComm(grid_data);
-  }
-
-  // Add all subdomains in our list
-  for(int i = 0;i < subdomain_list.size();++ i){
-    int sdom_id = subdomain_list[i];
-    comm->addSubdomain(sdom_id, grid_data->subdomains[sdom_id]);
-  }
-
-  /* Loop until we have finished all of our work */
-  while(comm->workRemaining()){
-
-    // Get a list of subdomains that have met dependencies
-    std::vector<int> sdom_ready = comm->readySubdomains();
-    int backlog = sdom_ready.size();
-
-    // Run top of list
-    if(backlog > 0){
-      int sdom_id = sdom_ready[0];
-      Subdomain &sdom = grid_data->subdomains[sdom_id];
-      // Clear boundary conditions
-      for(int dim = 0;dim < 3;++ dim){
-        if(sdom.upwind[dim].subdomain_id == -1){
-          sdom.plane_data[dim]->clear(0.0);
-        }
-      }
-      {
-        BLOCK_TIMER(grid_data->timing, Sweep_Kernel);
-        // Perform subdomain sweep
-        grid_data->kernel->sweep(grid_data, sdom_id);
-      }
-
-      // Mark as complete (and do any communication)
-      comm->markComplete(sdom_id);
-    }
-  }
-
-  delete comm;
-}
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Test/TestKernels.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Test/TestKernels.cpp
deleted file mode 100644
index 1ee5134bb..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Test/TestKernels.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke/Test/TestKernels.h>
-
-#include <Kripke.h>
-#include <Kripke/Grid.h>
-#include <Kripke/Input_Variables.h>
-
-/**
- * Functional object to run the LTimes kernel.
- */
-struct runLTimes {
-  std::string name(void) const { return "LTimes"; }
-
-  void operator ()(Grid_Data *grid_data) const {
-    grid_data->kernel->LTimes(grid_data);
-  }
-};
-
-/**
- * Functional object to run the LPlusTimes kernel.
- */
-struct runLPlusTimes {
-  std::string name(void) const { return "LPlusTimes"; }
-
-  void operator ()(Grid_Data *grid_data) const {
-    grid_data->kernel->LPlusTimes(grid_data);
-  }
-};
-
-
-/**
- * Functional object to run the scattering kernel.
- */
-struct runScattering {
-  std::string name(void) const { return "scattering"; }
-
-  void operator ()(Grid_Data *grid_data) const {
-    grid_data->kernel->scattering(grid_data);
-  }
-};
-
-
-/**
- * Functional object to run the source kernel.
- */
-struct runSource {
-  std::string name(void) const { return "source"; }
-
-  void operator ()(Grid_Data *grid_data) const {
-    grid_data->kernel->source(grid_data);
-  }
-};
-
-/**
- * Functional object to run the MPI sweep and sweep kernels
- */
-struct runSweep {
-  std::string name(void) const { return "Sweep"; }
-
-  void operator ()(Grid_Data *grid_data) const {
-    std::vector<int> sdom_list(grid_data->subdomains.size());
-    for(int i = 0;i < grid_data->subdomains.size();++ i){
-      sdom_list[i] = i;
-    }
-    SweepSubdomains(sdom_list, grid_data, false);
-  }
-};
-
-
-/**
- * Tests a specific kernel (using one of the above runXXX functional objects).
- */
-template<typename KernelRunner>
-void testKernel(Input_Variables &input_variables){
-  int myid=0;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-#endif
-
-  KernelRunner kr;
-
-  if(myid == 0){
-    printf("  Comparing %s to %s for kernel %s\n",
-      nestingString(NEST_GDZ).c_str(),
-      nestingString(input_variables.nesting).c_str(),
-      kr.name().c_str());
-  }
-
-  // Allocate two problems (one reference)
-  if(myid == 0)printf("    -- allocating\n");
-  Grid_Data *grid_data = new Grid_Data(&input_variables);
-
-  Nesting_Order old_nest = input_variables.nesting;
-  input_variables.nesting = NEST_GDZ;
-  Grid_Data *ref_data = new Grid_Data(&input_variables);
-  input_variables.nesting = old_nest;
-
-  // Generate random data in the reference problem, and copy it to the other
-  if(myid == 0)printf("    -- randomizing data\n");
-  ref_data->randomizeData();
-  grid_data->copy(*ref_data);
-
-  if(myid == 0)printf("    -- running kernels\n");
-
-  // Run both kernels
-  kr(ref_data);
-  kr(grid_data);
-
-  if(myid == 0)printf("    -- comparing results\n");
-  // Compare differences
-  bool is_diff = ref_data->compare(*grid_data, 1e-12, true);
-  if(is_diff){
-    if(myid == 0)KripkeAbort("Differences found, bailing out\n");
-  }
-
-  // Cleanup
-  if(myid == 0)printf("    -- OK\n\n");
-  delete grid_data;
-  delete ref_data;
-}
-
-
-/**
- * Tests all kernels given the specified input.
- */
-void testKernels(Input_Variables &input_variables){
-  // Run LTimes
-  testKernel<runLTimes>(input_variables);
-
-  // Run LPlusTimes
-  testKernel<runLPlusTimes>(input_variables);
-
-  // Run Scattering
-  testKernel<runScattering>(input_variables);
-
-  // Run Source
-  testKernel<runSource>(input_variables);
-
-  // Run Sweep
-  testKernel<runSweep>(input_variables);
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Test/TestKernels.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Test/TestKernels.h
deleted file mode 100644
index 2330e657d..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Test/TestKernels.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_TOOLS_TEST_KERNELS_H__
-#define KRIPKE_TOOLS_TEST_KERNELS_H__
-
-struct Input_Variables;
-
-void testKernels(Input_Variables &input_variables);
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Timing.cpp b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Timing.cpp
deleted file mode 100644
index 2bb01dcc3..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Timing.cpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke/Timing.h>
-#ifdef RAJA_USE_CALIPER
-#include <caliper/Annotation.h>
-#endif
-
-#include<Kripke.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <algorithm>
-#include <vector>
-#include <sstream>
-
-#ifdef KRIPKE_USE_MPI
-#include <mpi.h>
-#endif
-
-#ifdef KRIPKE_USE_BGPM
-extern "C" void HPM_Start(char const *);
-extern "C" void HPM_Stop(char const *);
-#endif
-
-
-#ifdef KRIPKE_USE_PAPI
-#include <papi.h>
-#endif
-
-
-Timing::~Timing(){
-#ifdef KRIPKE_USE_PAPI
-long long tmp[16];
-PAPI_stop(papi_set, tmp);
-#endif
-
-}
-
-void Timing::start(std::string const &name){
-  // get or create timer
-  Timer &timer = timers[name];
-
-  if(!timer.started){
-    timer.started = true;
-    timer.start_time = getTime();
-
-#ifdef KRIPKE_USE_PAPI
-    int num_papi = papi_event.size();
-    if(num_papi > 0){
-      if(timer.papi_total.size() == 0){
-        timer.papi_start_values.resize(num_papi, 0);
-        timer.papi_total.resize(num_papi, 0);
-      }
-
-      /*
-      // start timers
-      PAPI_start_counters(&papi_event[0], num_papi);
-
-      // clear timers
-      long long tmp[16];
-      PAPI_read_counters(tmp, num_papi);
-      */
-
-      // read initial values
-      PAPI_read(papi_set, &timer.papi_start_values[0]);
-
-    }
-#endif
-
-#ifdef RAJA_USE_CALIPER
-    cali::Annotation(name.c_str()).begin();
-#endif
-#ifdef KRIPKE_USE_BGPM
-    HPM_Start(name.c_str());
-#endif
-  }
-}
-
-void Timing::stop(std::string const &name){
-  // get or create timer
-  Timer &timer = timers[name];
-
-#ifdef KRIPKE_USE_BGPM
-    HPM_Stop(name.c_str());
-#endif
-
-#ifdef RAJA_USE_CALIPER
-    cali::Annotation(name.c_str()).end();
-#endif
-  if(timer.started){
-#ifdef KRIPKE_USE_PAPI
-    int num_papi = papi_event.size();
-    if(num_papi > 0){
-      // read timers
-      long long tmp[16];
-      //PAPI_stop_counters(tmp, num_papi);
-      PAPI_read(papi_set, tmp);
-
-      // accumulate to all started timers (since this clears the PAPI values)
-      for(int i = 0;i < num_papi;++ i){
-        timer.papi_total[i] += tmp[i] - timer.papi_start_values[i];
-      }
-
-    }
-#endif
-
-    // Stop the timer
-    timer.started = false;
-    timer.total_time += getTime() - timer.start_time;
-    timer.count ++;
-
-  }
-}
-
-void Timing::stopAll(void){
-  for(TimerMap::iterator i = timers.begin();i != timers.end();++ i){
-    stop((*i).first);
-  }
-}
-
-void Timing::clear(void){
-  timers.clear();
-}
-
-void Timing::print(void) const {
-  int rank=0;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-#endif
-  if(rank != 0){
-    return;
-  }
-
-  // build a sorted list of names
-  std::vector<std::string> names;
-  for(TimerMap::const_iterator i = timers.begin();i != timers.end();++ i){
-    names.push_back((*i).first);
-
-  }
-  std::sort(names.begin(), names.end());
-
-  std::vector<Timer const *> ord_timers;
-  for(int i = 0;i < names.size();++ i){
-    std::string &name = names[i];
-    TimerMap::const_iterator iter = timers.find(name);
-    ord_timers.push_back(&(*iter).second);
-  }
-
-  // Display column names
-  printf("Timers:\n");
-  printf("  %-16s  %12s  %12s", "Timer", "Count", "Seconds");
-#ifdef KRIPKE_USE_PAPI
-  int num_papi = papi_names.size();
-  for(int i = 0;i < num_papi;++i){
-    printf("  %16s", papi_names[i].c_str());
-  }
-#endif
-  printf("\n");
-
-  // Dislpay timer results
-  for(int i = 0;i < names.size();++ i){
-    printf("  %-16s  %12d  %12.5lf", names[i].c_str(), (int)ord_timers[i]->count, ord_timers[i]->total_time);
-#ifdef KRIPKE_USE_PAPI
-    for(int p = 0;p < num_papi;++ p){
-      printf("  %16ld", (long)ord_timers[i]->papi_total[p]);
-    }
-#endif
-    printf("\n");
-  }
-  
-  // Now display timers in machine readable format
-  printf("\n");
-  printf("TIMER_NAMES:");
-  for(int i = 0;i < names.size();++ i){
-    if(i > 0){
-      printf(",");
-    }
-    printf("%s", names[i].c_str());
-  }
-  printf("\n");
-  printf("TIMER_DATA:");
-  for(int i = 0;i < names.size();++ i){
-    if(i > 0){
-      printf(",");
-    }
-    printf("%lf", ord_timers[i]->total_time);    
-  }
-  printf("\n");
-}
-
-
-double Timing::getTotal(std::string const &name) const{
-  TimerMap::const_iterator i = timers.find(name);
-  if(i == timers.end()){
-    return 0.0;
-  }
-  return (*i).second.total_time;
-}
-
-
-
-void Timing::setPapiEvents(std::vector<std::string> names){
-#ifdef KRIPKE_USE_PAPI
-
-
-  static bool papi_initialized = false;
-  if(!papi_initialized){
-    //printf("PAPI INIT\n");
-    int retval = PAPI_library_init(PAPI_VER_CURRENT);
-    papi_initialized = true;
-
-    if(retval != PAPI_VER_CURRENT){
-      fprintf(stderr, "ERROR INITIALIZING PAPI\n");
-      exit(1);
-    }
-  }
-
-  //printf("PAPI VERSION=%x\n",
-  //    PAPI_VERSION);
-
-  papi_set = PAPI_NULL;
-  PAPI_create_eventset(&papi_set);
-
-
-  for(int i = 0;i < names.size();++ i){
-    // Convert text string to PAPI id
-    int event_code;
-    PAPI_event_name_to_code(
-        const_cast<char*>(names[i].c_str()),
-        &event_code);
-
-    // TODO: error checking?
-
-    // Add to our list of PAPI events
-    papi_names.push_back(names[i]);
-    papi_event.push_back(event_code);
-
-    int retval = PAPI_add_event(papi_set, event_code);
-    if(retval != PAPI_OK){
-      fprintf(stderr, "ERROR ADDING %s, retval=%d, ID=0x%-10x\n", names[i].c_str(), retval, event_code);
-    }
-
-    //printf("EVT=%s, ID=0x%-10x\n", names[i].c_str(), event_code);
-  }
-  PAPI_start(papi_set);
-#else
-  if(names.size() > 0){
-    fprintf(stderr, "WARNING: PAPI NOT ENABLED, IGNORING PAPI EVENTS\n");
-  }
-#endif
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Timing.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Timing.h
deleted file mode 100644
index 101f138bc..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/Kripke/Timing.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_TIMING_H__
-#define KRIPKE_TIMING_H__
-
-#include <string>
-#include <vector>
-#include <map>
-#include <stdio.h>
-#include <time.h>
-#include <sys/time.h>
-
-#ifdef KRIPKE_USE_PAPI
-#include<papi.h>
-#endif
-
-inline double getTime(void){
-  struct timeval tv;
-  gettimeofday(&tv, NULL);
-  return (double)tv.tv_sec + (double)tv.tv_usec/1000000.0;
-}
-
-
-struct Timer {
-  Timer() :
-    started(false),
-    start_time(0.0),
-    total_time(0.0),
-    count(0)
-  {}
-
-  bool started;
-  double start_time;
-  double total_time;
-  size_t count;
-#ifdef KRIPKE_USE_PAPI
-  std::vector<long long> papi_start_values;
-  std::vector<size_t> papi_total;
-#endif
-};
-
-class Timing {
-  public:
-    ~Timing();
-
-    void start(std::string const &name);
-    void stop(std::string const &name);
-
-    void stopAll(void);
-    void clear(void);
-
-    void print(void) const;
-    double getTotal(std::string const &name) const;
-
-    void setPapiEvents(std::vector<std::string> names);
-
-  private:
-    typedef std::map<std::string, Timer> TimerMap;
-    TimerMap timers;
-#ifdef KRIPKE_USE_PAPI
-  std::vector<std::string> papi_names;
-  std::vector<int> papi_event;
-  int papi_set;
-#endif
-};
-
-
-#include<stdio.h>
-
-// Aides timing a block of code, with automatic timer stopping
-class BlockTimer {
-  public:
-  inline BlockTimer(Timing &timer_obj, std::string const &timer_name) :
-      timer(timer_obj),
-      name(timer_name)
-  {
-      timer.start(name);
-  }
-  inline ~BlockTimer(){
-    timer.stop(name);
-  }
-
-  private:
-      Timing &timer;
-      std::string name;
-};
-
-#define BLOCK_TIMER(TIMER, NAME) BlockTimer BLK_TIMER_##NAME(TIMER, #NAME);
-
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/KripkeRAJA.h b/test/Kripke-v1.1/Kripke-v1.1-RAJA/KripkeRAJA.h
deleted file mode 100644
index 36a45927d..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/KripkeRAJA.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef KRIPKERAJA_H__
-#define KRIPKERAJA_H__
-
-#include<RAJA/RAJA.hxx>
-#include<Kripke/Grid.h>
-#include<Kripke/Subdomain.h>
-
-
-//#define RAJA_INLINE __attribute__((always_inline))
-
-
-#define RAJA_LAMBDA [=]
-//#define RAJA_LAMBDA [=] __device__
-
-
-// All of our OpenMP execution policies are swapped out with sequential if
-// an OpenMP compiler is not available.
-// Note:  It is always safe to replace OpenMP loops with sequential loops for
-// this code.
-#ifdef RAJA_ENABLE_OPENMP
-using kripke_omp_for_nowait_exec = RAJA::omp_for_nowait_exec;
-using kripke_omp_collapse_nowait_exec = RAJA::omp_collapse_nowait_exec;
-
-template<typename T>
-using kripke_OMP_Parallel = RAJA::OMP_Parallel<T>;
-
-#else
-typedef RAJA::simd_exec kripke_omp_for_nowait_exec;
-typedef RAJA::simd_exec kripke_omp_collapse_nowait_exec;
-
-template<typename T>
-using kripke_OMP_Parallel = RAJA::Execute;
-
-#endif
-
-// Subdomain loops
-template<typename SubdomainPolicy, typename BODY>
-RAJA_INLINE void forallSubdomains(Grid_Data *grid_data, BODY body){
-
-  RAJA::forall<SubdomainPolicy>(
-    RAJA::RangeSegment(0, grid_data->subdomains.size()),
-    [=](int sdom_id){
-      // get subdomain object
-      Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-      body(sdom_id, sdom);
-    });
-
-}
-
-// Loop over zoneset subdomains
-template<typename SubdomainPolicy, typename BODY>
-RAJA_INLINE void forallZoneSets(Grid_Data *grid_data, BODY body){
-
-  RAJA::forall<SubdomainPolicy>(
-    RAJA::RangeSegment(0, grid_data->num_zone_sets),
-    [=](int zs){
-      // get material mix information
-      int sdom_id = grid_data->zs_to_sdomid[zs];
-      Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-      body(zs, sdom_id, sdom);
-    });
-
-}
-
-
-
-//#define KRIPKE_USE_PSCOPE
-
-#ifdef KRIPKE_USE_PSCOPE
-
-
-#define FORALL_SUBDOMAINS(SDOM_POL, DOMAIN, ID, SDOM) \
-  forallSubdomains<SDOM_POL>(DOMAIN, [&](int ID, Subdomain &SDOM){
-
-#define FORALL_ZONESETS(SDOM_POL, DOMAIN, ID, SDOM) \
-  forallZoneSets<SDOM_POL>(DOMAIN, [&](int zone_set, int ID, Subdomain &SDOM){
-
-
-#define END_FORALL });
-
-#else
-
-// Eliminates policy scope outer lambda,
-#define BEGIN_POLICY(NVAR, NTYPE) \
-  { \
-    typedef NEST_DGZ_T NTYPE;
-
-#define END_POLICY }
-
-
-#define FORALL_SUBDOMAINS(SDOM_POL, DOMAIN, ID, SDOM) \
-  for(int ID = 0;ID < DOMAIN.subdomains.size();++ ID){ \
-    Subdomain &SDOM = DOMAIN.subdomains[ID];
-
-#define FORALL_ZONESETS(SDOM_POL, DOMAIN, ID, SDOM) \
-  for(int _zset_idx = 0;_zset_idx < DOMAIN.num_zone_sets;++ _zset_idx){ \
-    int ID = DOMAIN.zs_to_sdomid[_zset_idx]; \
-    Subdomain &SDOM = DOMAIN.subdomains[ID];
-
-
-#define END_FORALL }
-
-#endif
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/NOTICE.md b/test/Kripke-v1.1/Kripke-v1.1-RAJA/NOTICE.md
deleted file mode 100644
index f2dff6f71..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/NOTICE.md
+++ /dev/null
@@ -1,40 +0,0 @@
-LLNL-CODE-658597
-Title: Kripke, Version: 1.1
-Author(s) Adam J. Kunen, etc. all......
-
-
-NOTICE
-======
-
-This work was produced at the Lawrence Livermore National Laboratory (LLNL) 
-under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S. Department
-of Energy (DOE) and Lawrence Livermore National Security, LLC (LLNS) for the 
-operation of LLNL. The rights of the Federal Government are reserved under 
-Contract 44.
-
-
-DISCLAIMER
-==========
-
-This work was prepared as an account of work sponsored by an agency of the 
-United States Government. Neither the United States Government nor Lawrence 
-Livermore National Security, LLC nor any of their employees, makes any 
-warranty, express or implied, or assumes any liability or responsibility for 
-the accuracy, completeness, or usefulness of any information, apparatus, 
-product, or process disclosed, or represents that its use would not infringe 
-privately-owned rights. Reference herein to any specific commercial products, 
-process, or service by trade name, trademark, manufacturer or otherwise does 
-not necessarily constitute or imply its endorsement, recommendation, or 
-favoring by the United States Government or Lawrence Livermore National 
-Security, LLC. The views and opinions of authors expressed herein do not 
-necessarily state or reflect those of the United States Government or Lawrence 
-Livermore National Security, LLC, and shall not be used for advertising or 
-product endorsement purposes.
-
-
-NOTIFICATION OF COMMERCIAL USE
-==============================
-
-Commercialization of this product is prohibited without notifying the 
-Department of Energy (DOE) or Lawrence Livermore National Security.
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-RAJA/README.md b/test/Kripke-v1.1/Kripke-v1.1-RAJA/README.md
deleted file mode 100644
index 6daebb726..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-RAJA/README.md
+++ /dev/null
@@ -1,363 +0,0 @@
-KRIPKE
-======
-
-Version 1.1
-
-Release Date 9/13/2015 
-
-
-Authors
-=======
-  * Adam J. Kunen [kunen1@llnl.gov](mailto:kunen1@llnl.gov) (Primary point of contact)
-  * Peter N. Brown [brown42@llnl.gov](mailto:brown42@llnl.gov)
-  * Teresa S. Bailey [bailey42@llnl.gov](mailto:bailey42@llnl.gov)
-  * Peter G. Maginot [maginot1@llnl.gov](mailto:maginot1@llnl.gov)
-
-
-License
-=======
-See included file NOTICE.md
-
-RAJA Version
-============
-This is a variant of Kripke that uses RAJA::forallN for its compute kernels.
-See Kripke/Kernel.cpp for all of these kernels.
-
-Currently there are issues building with the Intel compiler which are being
-investigated.  As a result, building the RAJA version of Kripke with icpc 
-will diable all but the DGZ data layouts, and disables complex execution
-policies.  Once a resolution to these issues have been found, this code will
-be updated.
-
-Overview
-========
-Kripke is a simple, scalable, 3D Sn deterministic particle transport code.  Its
-primary purpose is to research how data layout, programming paradigms and 
-architectures effect the implementation and performance of Sn transport.  A 
-main goal of Kripke is investigating how different data-layouts affect 
-instruction, thread and task level parallelism, and what the implications are 
-on overall solver performance.
-
-Kripkie supports storage of angular fluxes (Psi) using all six striding orders 
-(or "nestings") of Directions (D), Groups (G), and Zones (Z), and provides 
-computational kernels specifically written for each of these nestings. Most Sn 
-transport codes are designed around one of these nestings, which is an 
-inflexibility that leads to software engineering compromises when porting to 
-new architectures and programming paradigms.
-
-Early research has found that the problem dimensions (zones, groups, 
-directions, scattering order) and the scaling (number of threads and MPI 
-tasks), can make a profound difference in the performance of each of these 
-nestings. To our knowledge this is a capability unique to Kripke, and should 
-provide key insight into how data-layout effects Sn solver performance. An 
-asynchronous MPI-based parallel sweep algorithm is provided, which employs the 
-concepts of Group Sets (GS) Zone Sets (ZS), and Direction Sets (DS), borrowed 
-from the [Texas A&M code PDT](https://parasol.tamu.edu/asci/).
-
-As we explore new architectures and programming paradigms with Kripke, we will 
-be able to incorporate these findings and ideas into our larger codes. The main
-advantages of using Kripke for this exploration is that it's light-weight (ie. 
-easily refactored and modified), and it gets us closer to the real question we 
-want answered: "What is the best way to layout and implement an Sn code on a 
-given architecture+programming-model?" instead of the more commonly asked 
-question "What is the best way to map my existing Sn code to a given 
-architecture+programming-model?".
-
-
-Mini App or Proxy App?
-----------------------
-Kripke is a Mini-App since it has a very small code base consisting of 4184 
-lines of C++ code (generated using David A. Wheeler's SLOCCount v2.26).
-
-Kripke is also a Proxy-App since it is a proxy for the LLNL transport code 
-ARDRA.
-
-
-Analysis
---------
-A major challenge of achieving high-performance in an Sn transport (or any 
-physics) code is choosing a data-layout and a parallel decomposition that lends
-itself to the targeted architecture. Often the data-layout determines the most 
-efficient nesting of loops in computational kernels, which then determines how 
-well your inner-most-loop SIMDizes, how you add threading (pthreads, OpenMP, 
-etc.), and the efficiency and design of your parallel algorithms. Therefore, 
-each nesting produces different loop nesting orders, which provides 
-substantially different performance characteristics. We want to explore how 
-easily and efficiently these different nestings map to different architectures.
-In particular, we are interested in how we can achieve good parallel efficiency
-while also achieving efficient use of node resources (such as SIMD units, 
-memory systems, and accelerators).
-
-Parallel sweep algorithms can be explored with Kripke in multiple ways. The 
-core MPI algorithm could be modified or rewritten to explore other approaches, 
-domain overloading, or alternate programming models (such as Charm++). The 
-effect of load-imbalance is an understudied aspect of Sn transport sweeps, and 
-could easily be studied with Kripke by artificially adding more work (ie 
-unknowns) to a subset of MPI tasks. Block-AMR could be added to Kripke, which 
-would be a useful way to explore the cost-benefit analysis of adding AMR to an 
-Sn code, and would be a way to further study load imbalances and AMR effects 
-on sweeps.
-
-The coupling of on-node sweep kernel, the parallel sweep algorithm, and the 
-choices of decomposing the problem phase space into GS's, ZS's and DS's impact 
-the performance of the overall sweep. The tradeoff between large and small 
-"units of work" can be studied. Larger "units of work" provide more opportunity
-for on-node parallelism, while creating larger messages, less "sends", and less
-efficient parallel sweeps. Smaller "units of work" make for less efficient 
-on-node kernels, but more efficient parallel sweeps. 
-
-We can also study trading MPI tasks for threads, and the effects this has on 
-our programming models and cache efficiency.
-
-A simple timer infrastructure is provided that measure each compute kernels 
-total time.
-
-
-Physical Models
----------------
-
-Kripke solves the Discrete Ordinance and Diamond Difference discretized 
-steady-state linear Boltzmann equation. 
-
-        H * Psi = (LPlus * S * L) * Psi + Q
-
-Where:
-
-*   **Psi** is the unknown angular flux discretized over zones, directions, 
-    and energy groups
-
-*   **H** is the "streaming-collision" operator.  (Couples zones)
-
-*   **L** is the "discrete-to-moments operator. (Couples directions and moments)
-
-*   **LPlus** is the "moment-to-discrete" operator. 
-    (Couples directions and moments)
-
-*   **S** is the (arbitrary) order scattering operator. (Couples groups)
-
-*   **Q** is an external source. In Kripke it is represented in moment space, 
-    so really "LPlus*Q"
-
-
-Kripke is hard-coded to setup and solve the [3D Kobayashi radiation benchmark, 
-problem 3i](https://www.oecd-nea.org/science/docs/2000/nsc-doc2000-4.pdf).  
-Since Kripke does not have reflecting boundary conditions, the full-space model
-is solved. Command line arguments allow the user to modify the total and 
-scattering cross-sections.  Since Kripke is a multi-group transport code and 
-the Kobayashi problem is single-group, each energy group is setup to solve the 
-same problem with no group-to-group coupling in the data.
-
-
-The steady-state solution method uses the source-iteration technique, where 
-each iteration is as follows:
-
-1.  Phi = LTimes(Psi)
-2.  PhiOut = Scattering(Phi)
-3.  PhiOut = PhiOut + Source()
-4.  Rhs = LPlusTimes(PhiOut)
-5.  Psi = Sweep(Rhs, Psi)  which is solving Psi=(Hinverse * Rhs) a.k.a 
-    _"Inverting H"_
-
-
-
-Building and Running
-====================
-
-Kripke comes with a simple CMake based build system.
-
-Requirements
-------------
-*  CMake 3.0 or later
-*  C++ Compiler (g++, icpc, etc.)
-*  MPI 1.0 or later
-
-
-
-Quick Start
------------
-The easiest way to get Kripke running, is to directly invoke CMake and take 
-whatever system defaults you have for compilers and let CMake find MPI for you.
-
-*  Step 1:  Create a build space (assuming you are starting in the Kripke root 
-   directory)   
-        
-        mkdir build
-
-*  Step 2: Run CMake in that build space
-        
-        cd kripke
-        cmake ..
-
-*  Step 3: Now make Kripke:
-         
-        make -j8
-  
-*  Step 4: Run the test suite to make sure it works
-   
-        make test
-  
-*  Step 5: Run Kripke's default problem:
-   
-        ./kripke
-  
-
-Running Kripke
-==============
-
-Environment Variabes
---------------------
-
-If Kripke is build with OpenMP support, then the environment variables 
-``OMP_NUM_THREADS`` is used to control the number of OpenMP threads.  Kripke 
-does not attempt to modify the OpenMP runtime in anyway, so other ``OMP_*`` 
-environment variables should also work as well.
- 
-
-Command Line Options
---------------------
-Command line option help can also be viewed by running "./kripke --help"
-
-### Problem Size Options:
-
-*   **``--groups <ngroups>``**     
-
-    Number of energy groups. (Default: --groups 32)
-
-*   **``--legendre <lorder>``**    
-
-    Scattering Legendre Expansion Order (0, 1, ...).  (Default: --legendre 4)
-
-*   **``--quad <ndirs>``**, or **``--quad <polar>:<azim>``**
-
-    Define the quadrature set to use either a fake S2 with <ndirs> points, OR 
-		Gauss-Legendre with <polar> by <azim> points.   (Default: --quad 96)
-
-*   **``--zones <x>,<y>,<z>``**
-
-    Number of zones in x,y,z.  (Default: --zones 16,16,16)
-
-
-### Physics Parameters:
-
-*   **``--sigt <sigt0,sigt1,sigt2>``**
- 
-    Total material cross-sections.  (Default:   --sigt 0.1,0.0001,0.1)
-
-*   **``--sigs <sigs0,sigs1,sigs2>``**
- 
-    Total material cross-sections.  (Default:   --sigs 0.05,0.00005,0.05)
-
-
-### On-Node Options:
-
-*   **``--nest <NEST>``**
-
-    Loop nesting order (and data layout), available are DGZ, DZG, GDZ, GZD, 
-		ZDG, and ZGD. (Default: --nest DGZ)
-
-
-###Parallel Decomposition Options:
-
-*   **``--layout <lout>``**        
-    
-    Layout of spatial subdomains over mpi ranks. 0 for "Blocked" where local 
-		zone sets represent adjacent regions of space. 1 for "Scattered" where 
-		adjacent regions of space are distributed to adjacent MPI ranks. 
-		(Default: --layout 0)
-
-*   **--procs <npx,npy,npz>**  
-    
-    Number of MPI ranks in each spatial dimension. (Default:  --procs 1,1,1)
-
-*   **``--dset <ds>``**
-
-    Number of direction-sets.  Must be a factor of 8, and divide evenly the 
-		number of quadrature points. (Default:  --dset 8)
-
-*   **``--gset <gs>``**            
-    
-    Number of energy group-sets.  Must divide evenly the number energy groups. 
-		(Default:  --gset 1)
-
-*   **``--zset <zx>,<zy>,<zz>``**  
-    
-    Number of zone-sets in x, y, and z.  (Default:  --zset 1:1:1)
-
-
-###Solver Options:
-
-*   **``--niter <NITER>``**
-
-    Number of solver iterations to run. (Default:  --niter 10)
-
-*   **``--pmethod <method>``**     
-
-    Parallel solver method. "sweep" for full up-wind sweep (wavefront 
-		algorithm). "bj" for Block Jacobi.  (Default: --pmethod sweep)
-
-
-### Output and Testing Options:
-
-*   **``--test``**                 
-
-    Run Kernel Test instead of solve
-
-*   **``--silo <siloname>``**                 
-
-    Write SILO output (requires building with LLNL's Silo library)
-
-*   **``--papi <PAPI_XXX_XXX,...>``**
-
-    Track PAPI hardware counters for each timer. (requires building with 
-		PAPI library)
-    
-
-Test Suite
-----------
-
-Running with the ``--test`` command line argument will run a unit-testing frame
-work that will compare each kernel, using random input data, with the same 
-kernel from a different nesting.  This is very useful for checking correctness 
-of kernels after modification.
-
-Running ``make test`` will use the CMake testing framework, CTest, to run a 
-series of tests outlined in the root ``CMakeLists.txt`` file.
-
-
-Future Plans
-============
-
-Some ideas for future study:
-
-*   Block AMR.
-
-*   More FLOP intensive spatial discretizations such as DFEM's.
-
-*   Programming model abstractions
-
-
-Retirement
-==========
-
-Retirement of this Mini-App should be considered when it is no longer a 
-representative of state-of-the-art transport codes, or when it becomes too 
-cumbersome to adapt to advanced architectures. Also, at the point of 
-retirement it should be clear how to design its successor.
-
-
-Publications, Presentations, Links
-==================================
-
-*  [LLNL Codesign Website](https://codesign.llnl.gov/index.php)
-
-*  A. J. Kunen, T. S. Bailey, P. N. Brown, [KRIPKE- A Massively Parallel Transport Mini-App](https://codesign.llnl.gov/pdfs/Kripke_ANS_2015_Paper.pdf) American Nuclear Society M&C 2015,  April 21, 2015 (LLNL-CONF-675389)
-
-*  A. J. Kunen, [RAJA-Like Transformations in Kripke](https://codesign.llnl.gov/pdfs/TLoops.pdf), February 5, 2015 (LLNL-PRES-666686)
-
-*  A. J. Kunen,  [An Sn Transport Mini App](https://codesign.llnl.gov/pdfs/Kripke_Present.pdf), October 22, 2014 (LLNL-PRES-661866)
-
-
-
-Release
-=======
-LLNL-CODE-658597
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/CMakeLists.txt b/test/Kripke-v1.1/Kripke-v1.1-baseline/CMakeLists.txt
deleted file mode 100644
index a039b7e15..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/CMakeLists.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-include_directories(.)
-
-add_subdirectory(Kripke)
-
-if(RAJA_ENABLE_CUDA)
-  cuda_add_executable(Kripke-v1.1-baseline.exe "Kripke.cpp")
-else()
-  add_executable(Kripke-v1.1-baseline.exe "Kripke.cpp")
-endif()
-
-target_link_libraries(Kripke-v1.1-baseline.exe ${KRIPKE_LIBS} ${KRIPKE_LIBS} ${RT_LIBRARIES})
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke.cpp
deleted file mode 100644
index 280b5c254..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke.cpp
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke.h>
-#include<Kripke/Input_Variables.h>
-#include<Kripke/Grid.h>
-#include<Kripke/Test/TestKernels.h>
-#include<stdio.h>
-#include<string.h>
-#include<algorithm>
-#include<string>
-#include<sstream>
-
-#ifdef KRIPKE_USE_MPI
-#include<mpi.h>
-#endif
-
-#ifdef KRIPKE_USE_OPENMP
-#include<omp.h>
-#endif
-
-#ifdef KRIPKE_USE_TCMALLOC
-#include<gperftools/malloc_extension.h>
-#endif
-
-#ifdef KRIPKE_USE_PERFTOOLS
-#include<gperftools/profiler.h>
-#endif
-
-#ifdef __bgq__
-#include </bgsys/drivers/ppcfloor/spi/include/kernel/location.h>
-#include </bgsys/drivers/ppcfloor/spi/include/kernel/memory.h>
-#endif
-
-
-void usage(void){
-  int myid=0;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-#endif
-  if(myid == 0){
-    // Get a new object with defaulted values
-    Input_Variables def;
-    
-    // Display command line
-    printf("Usage:  [srun ...] kripke [options...]\n\n");
-    
-    // Display each option
-    printf("Problem Size Options:\n");
-    printf("---------------------\n");
-    
-    printf("  --groups <ngroups>     Number of energy groups\n");
-    printf("                         Default:  --groups %d\n\n", def.num_groups);
-    
-    printf("  --legendre <lorder>    Scattering Legendre Expansion Order (0, 1, ...)\n");
-    printf("                         Default:  --legendre %d\n\n", def.legendre_order);
-    
-    printf("  --quad [<ndirs>|<polar>:<azim>]\n");
-    printf("                         Define the quadrature set to use\n");
-    printf("                         Either a fake S2 with <ndirs> points,\n");
-    printf("                         OR Gauss-Legendre with <polar> by <azim> points\n");
-    printf("                         Default:  --quad %d\n\n", def.num_directions);
-    
-    
-    
-    printf("  --zones <x,y,z>        Number of zones in x,y,z\n");
-    printf("                         Default:  --zones %d,%d,%d\n\n", def.nx, def.ny, def.nz);
-    
-    
-    printf("\n");
-    printf("Physics Parameters:\n");
-    printf("-------------------\n");
-    printf("  --sigt <st0,st1,st2>   Total material cross-sections\n");
-    printf("                         Default:   --sigt %lf,%lf,%lf\n\n", def.sigt[0], def.sigt[1], def.sigt[2]);
-
-    printf("  --sigs <ss0,ss1,ss2>   Scattering material cross-sections\n");
-    printf("                         Default:   --sigs %lf,%lf,%lf\n\n", def.sigs[0], def.sigs[1], def.sigs[2]);
-
-
-    printf("\n");
-    printf("On-Node Options:\n");
-    printf("----------------\n");
-    printf("  --nest <NEST>          Loop nesting order (and data layout)\n");
-    printf("                         Available: DGZ,DZG,GDZ,GZD,ZDG,ZGD\n");
-    printf("                         Default:   --nest %s\n\n", nestingString(def.nesting).c_str());
-    
-    printf("\n");
-    printf("Parallel Decomposition Options:\n");
-    printf("-------------------------------\n");
-    printf("  --layout <lout>        Layout of spatial subdomains over mpi ranks\n");
-    printf("                         0: Blocked: local zone sets are adjacent\n");
-    printf("                         1: Scattered: adjacent zone sets are distributed\n");
-    printf("                         Default: --layout %d\n\n", def.layout_pattern);
-    
-    
-    printf("  --procs <npx,npy,npz>  Number of MPI ranks in each spatial dimension\n");
-    printf("                         Default:  --procs %d,%d,%d\n\n", def.npx, def.npy, def.npz);
-    
-    printf("  --dset <ds>            Number of direction-sets\n");
-    printf("                         Must be a factor of 8, and divide evenly the number\n");
-    printf("                         of quadrature points\n");
-    printf("                         Default:  --dset %d\n\n", def.num_dirsets);
-    
-    printf("  --gset <gs>            Number of energy group-sets\n");
-    printf("                         Must divide evenly the number energy groups\n");
-    printf("                         Default:  --gset %d\n\n", def.num_groupsets);
-    
-    printf("  --zset <zx>,<zy>,<zz>  Number of zone-sets in x,y, and z\n");
-    printf("                         Default:  --zset %d,%d,%d\n\n", def.num_zonesets_dim[0], def.num_zonesets_dim[1], def.num_zonesets_dim[2]);
-    
-    printf("\n");
-    printf("Solver Options:\n");
-    printf("---------------\n");
-    
-    printf("  --niter <NITER>        Number of solver iterations to run\n");
-    printf("                         Default:  --niter %d\n\n", def.niter);
-    
-    printf("  --pmethod <method>     Parallel solver method\n");
-    printf("                         sweep: Full up-wind sweep (wavefront algorithm)\n");
-    printf("                         bj: Block Jacobi\n");
-    printf("                         Default: --pmethod sweep\n\n");
-    
-    printf("\n");
-    printf("Output and Testing Options:\n");
-    printf("---------------------------\n");
-    
-#ifdef KRIPKE_USE_PAPI
-    printf("  --papi <PAPI_X_X,...>  Track PAPI hardware counters for each timer\n\n");
-#endif
-#ifdef KRIPKE_USE_SILO
-    printf("  --silo <BASENAME>      Create SILO output files\n\n");
-#endif
-    printf("  --test                 Run Kernel Test instead of solver\n\n");
-    printf("\n");
-  }
-#ifdef KRIPKE_USE_MPI
-  MPI_Finalize();
-#endif
-  exit(1);
-}
-
-struct CmdLine {
-  CmdLine(int argc, char **argv) :
-    size(argc-1),
-    cur(0),
-    args()
-  {
-    for(int i = 0;i < size;++ i){
-      args.push_back(argv[i+1]);
-    }
-  }
-
-  std::string pop(void){
-    if(atEnd())
-      usage();
-    return args[cur++];
-  }
-
-  bool atEnd(void){
-    return(cur >= size);
-  }
-
-  int size;
-  int cur;
-  std::vector<std::string> args;
-};
-
-std::vector<std::string> split(std::string const &str, char delim){
-  std::vector<std::string> elem;
-  std::stringstream ss(str);
-  std::string e;
-  while(std::getline(ss, e, delim)){
-    elem.push_back(e);
-  }
-  return elem;
-}
-
-
-namespace {
-  template<typename T>
-  std::string toString(T const &val){
-    std::stringstream ss;
-    ss << val;
-    return ss.str();
-  }
-}
-
-int main(int argc, char **argv) {
-  /*
-   * Initialize MPI
-   */
-
-  int myid=0;
-  int num_tasks=1;
-#ifdef KRIPKE_USE_MPI
-  MPI_Init(&argc, &argv);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-  MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);
-#endif
-
-  if (myid == 0) {
-    /* Print out a banner message along with a version number. */
-    printf("\n");
-    printf("----------------------------------------------------------------------\n");
-    printf("------------------------ KRIPKE VERSION 1.1 --------------------------\n");
-    printf("----------------------------------------------------------------------\n");
-    printf("This work was produced at the Lawrence Livermore National Laboratory\n");
-    printf("(LLNL) under contract no. DE-AC-52-07NA27344 (Contract 44) between the\n");
-    printf("U.S. Department of Energy (DOE) and Lawrence Livermore National\n");
-    printf("Security, LLC (LLNS) for the operation of LLNL. The rights of the\n");
-    printf("Federal Government are reserved under Contract 44.\n");
-    printf("\n");
-    printf("Main Contact: Adam J. Kunen <kunen1@llnl.gov>\n");
-    printf("----------------------------------------------------------------------\n");
-   
-   
-    /* Print out some information about how OpenMP threads are being mapped
-     * to CPU cores.
-     */
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel
-    {
-      int tid = omp_get_thread_num();
-#ifdef __bgq__
-      int core = Kernel_ProcessorCoreID();
-#else
-      int core = sched_getcpu();
-#endif
-      printf("Rank: %d Thread %d: Core %d\n", myid, tid, core);
-    }
-#endif
-  }
-
-  /*
-   * Default input parameters
-   */
-  Input_Variables vars;
-  std::vector<std::string> papi_names;
-  bool test = false;
-  
-  /*
-   * Parse command line
-   */
-  CmdLine cmd(argc, argv);
-  while(!cmd.atEnd()){
-    std::string opt = cmd.pop();
-    if(opt == "-h" || opt == "--help"){usage();}
-    else if(opt == "--name"){vars.run_name = cmd.pop();}
-    else if(opt == "--dset"){
-      vars.num_dirsets = std::atoi(cmd.pop().c_str());      
-    }
-    else if(opt == "--gset"){
-      vars.num_groupsets = std::atoi(cmd.pop().c_str());      
-    }
-    else if(opt == "--zset"){
-      std::vector<std::string> nz = split(cmd.pop(), ',');
-      if(nz.size() != 3) usage();
-      vars.num_zonesets_dim[0] = std::atoi(nz[0].c_str());
-      vars.num_zonesets_dim[1] = std::atoi(nz[1].c_str());
-      vars.num_zonesets_dim[2] = std::atoi(nz[2].c_str());      
-    }
-    else if(opt == "--layout"){
-      vars.layout_pattern = std::atoi(cmd.pop().c_str());      
-    }
-    else if(opt == "--zones"){
-      std::vector<std::string> nz = split(cmd.pop(), ',');
-      if(nz.size() != 3) usage();
-      vars.nx = std::atoi(nz[0].c_str());
-      vars.ny = std::atoi(nz[1].c_str());
-      vars.nz = std::atoi(nz[2].c_str());
-    }
-    else if(opt == "--procs"){
-      std::vector<std::string> np = split(cmd.pop(), ',');
-      if(np.size() != 3) usage();
-      vars.npx = std::atoi(np[0].c_str());
-      vars.npy = std::atoi(np[1].c_str());
-      vars.npz = std::atoi(np[2].c_str());
-    }
-    else if(opt == "--pmethod"){
-      std::string method = cmd.pop();
-      if(!strcasecmp(method.c_str(), "sweep")){
-        vars.parallel_method = PMETHOD_SWEEP;
-      }
-      else if(!strcasecmp(method.c_str(), "bj")){
-        vars.parallel_method = PMETHOD_BJ;
-      }
-      else{
-        usage();
-      }
-    }
-    else if(opt == "--groups"){
-      vars.num_groups = std::atoi(cmd.pop().c_str());      
-    }
-    else if(opt == "--quad"){
-      std::vector<std::string> p = split(cmd.pop(), ':');
-      if(p.size() == 1){
-        vars.num_directions = std::atoi(p[0].c_str());
-        vars.quad_num_polar = 0;
-        vars.quad_num_azimuthal = 0;
-      }
-      else if(p.size() == 2){
-        vars.quad_num_polar = std::atoi(p[0].c_str());
-        vars.quad_num_azimuthal = std::atoi(p[1].c_str());
-        vars.num_directions = vars.quad_num_polar * vars.quad_num_azimuthal;
-      }
-      else{
-        usage();
-      }
-    }
-    else if(opt == "--legendre"){
-      vars.legendre_order = std::atoi(cmd.pop().c_str());
-    }
-    else if(opt == "--sigs"){
-      std::vector<std::string> values = split(cmd.pop(), ',');
-      if(values.size()!=3)usage();
-      for(int mat = 0;mat < 3;++ mat){
-        vars.sigs[mat] = std::atof(values[mat].c_str());
-      }
-    }
-    else if(opt == "--sigt"){
-      std::vector<std::string> values = split(cmd.pop(), ',');
-      if(values.size()!=3)usage();
-      for(int mat = 0;mat < 3;++ mat){
-        vars.sigt[mat] = std::atof(values[mat].c_str());
-      }
-    }
-    else if(opt == "--niter"){
-      vars.niter = std::atoi(cmd.pop().c_str());
-    }
-    else if(opt == "--nest"){
-      vars.nesting = nestingFromString(cmd.pop());     
-    }
-#ifdef KRIPKE_USE_SILO
-    else if(opt == "--silo"){
-      vars.silo_basename = cmd.pop();
-    }
-#endif
-    else if(opt == "--test"){
-      test = true;
-    }
-#ifdef KRIPKE_USE_PAPI
-    else if(opt == "--papi"){
-      papi_names = split(cmd.pop(), ',');
-    }
-#endif
-    else{
-      printf("Unknwon options %s\n", opt.c_str());
-      usage();
-    }
-  }
-  
-  // Check that the input arguments are valid
-  if(vars.checkValues()){
-    exit(1);
-  }
-
-  /*
-   * Display Options
-   */
-  if (myid == 0) {
-    printf("Number of MPI tasks:   %d\n", num_tasks);
-#ifdef KRIPKE_USE_OPENMP
-    int num_threads=1;
-#pragma omp parallel
-    {
-      num_threads = omp_get_num_threads();
-      if(omp_get_thread_num() == 0){
-          printf("OpenMP threads/task:   %d\n", num_threads);
-          printf("OpenMP total threads:  %d\n", num_threads*num_tasks);
-        }
-    }
-#endif
-
-#ifdef KRIPKE_USE_PAPI
-    printf("PAPI Counters:         ");
-    if(papi_names.size() > 0){
-      for(int i = 0;i < papi_names.size();++ i){
-        printf("%s ", papi_names[i].c_str());
-      }
-    }
-    else{
-      printf("<none>");
-    }
-    printf("\n");
-#endif
-    printf("Processors:            %d x %d x %d\n", vars.npx, vars.npy, vars.npz);
-    printf("Zones:                 %d x %d x %d\n", vars.nx, vars.ny, vars.nz);
-    printf("Legendre Order:        %d\n", vars.legendre_order);
-    printf("Total X-Sec:           sigt=[%lf, %lf, %lf]\n", vars.sigt[0], vars.sigt[1], vars.sigt[2]);
-    printf("Scattering X-Sec:      sigs=[%lf, %lf, %lf]\n", vars.sigs[0], vars.sigs[1], vars.sigs[2]);
-    printf("Quadrature Set:        ");
-    if(vars.quad_num_polar == 0){
-      printf("Dummy S2 with %d points\n", vars.num_directions);
-    }
-    else {
-      printf("Gauss-Legendre, %d polar, %d azimuthal (%d points)\n", vars.quad_num_polar, vars.quad_num_azimuthal, vars.num_directions);
-    }
-    printf("Parallel method:       ");
-    if(vars.parallel_method == PMETHOD_SWEEP){
-      printf("Sweep\n");
-    }
-    else if(vars.parallel_method == PMETHOD_BJ){
-      printf("Block Jacobi\n");
-    }
-    printf("Loop Nesting Order     %s\n", nestingString(vars.nesting).c_str());        
-    printf("Number iterations:     %d\n", vars.niter);
-    
-    printf("GroupSet/Groups:       %d sets, %d groups/set\n", vars.num_groupsets, vars.num_groups/vars.num_groupsets);
-    printf("DirSets/Directions:    %d sets, %d directions/set\n", vars.num_dirsets, vars.num_directions/vars.num_dirsets);
-
-    printf("Zone Sets:             %d,%d,%d\n", vars.num_zonesets_dim[0], vars.num_zonesets_dim[1], vars.num_zonesets_dim[2]);
-
-    
-  }
-
-#ifdef KRIPKE_USE_PERFTOOLS
-  ProfilerStart("kripke.prof");
-#endif  
-
-  if(test){
-    // Invoke Kernel testing
-    testKernels(vars);
-  }
-  else{
-    // Allocate problem 
-    Grid_Data *grid_data = new Grid_Data(&vars);
-
-    grid_data->timing.setPapiEvents(papi_names);
-
-    // Run the solver
-    SweepSolver(grid_data, vars.parallel_method == PMETHOD_BJ);
-
-#ifdef KRIPKE_USE_SILO
-    // Output silo data
-    if(vars.silo_basename != ""){
-      grid_data->writeSilo(vars.silo_basename);
-    }
-#endif
-
-    // Print Timing Info
-    int myid=0;
-#ifdef KRIPKE_USE_MPI
-    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-#endif
-    if(myid == 0){
-      grid_data->timing.print();
-      printf("\n\n");
-    }
-
-    // Cleanup 
-    delete grid_data;
-  }
-  
-#ifdef KRIPKE_USE_PERFTOOLS
-  ProfilerStop();
-#endif  
-
-  // Gather post-point memory info
-  double heap_mb = -1.0;
-  double hwm_mb = -1.0;
-#ifdef KRIPKE_USE_TCMALLOC
-  // If we are using tcmalloc, we need to use it's interface
-  MallocExtension *mext = MallocExtension::instance();
-  size_t bytes;
-
-  mext->GetNumericProperty("generic.current_allocated_bytes", &bytes);
-  heap_mb = ((double)bytes)/1024.0/1024.0;
-
-  mext->GetNumericProperty("generic.heap_size", &bytes);
-  hwm_mb = ((double)bytes)/1024.0/1024.0;
-#else
-#ifdef __bgq__
-  // use BG/Q specific calls (if NOT using tcmalloc)
-  uint64_t bytes;
-
-  int rc = Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAP, &bytes);
-  heap_mb = ((double)bytes)/1024.0/1024.0;
-
-  rc = Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAPMAX, &bytes);
-  hwm_mb = ((double)bytes)/1024.0/1024.0;
-#endif
-#endif
-  // Print memory info
-  if(myid == 0 && heap_mb >= 0.0){
-    printf("Bytes allocated: %lf MB\n", heap_mb);
-    printf("Heap Size      : %lf MB\n", hwm_mb);
-
-  }
-  
-  // Cleanup and exit
-#ifdef KRIPKE_USE_MPI
-  MPI_Finalize();
-#endif
-
-  return (0);
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke.h
deleted file mode 100644
index 6d32443bf..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_H__
-#define KRIPKE_H__
-
-#include<string>
-#include<vector>
-#include<stdio.h>
-#include<cmath>
-#include<strings.h>
-
-// Let the RAJA build system decide if we are using OpenMP
-#ifdef RAJA_ENABLE_OPENMP
-#define KRIPKE_USE_OPENMP
-#endif
-
-
-// Stubs for building without MPI
-#ifdef KRIPKE_USE_MPI
-
-#include<mpi.h>
-
-#define KripkeAbort(...) {printf(__VA_ARGS__); MPI_Abort(MPI_COMM_WORLD, 1);}
-
-#else
-
-
-#define KripkeAbort(...) {printf(__VA_ARGS__); exit(1);}
-
-#endif
-
-
-// Adopt RAJA's use of OPENMP
-#include<RAJA/RAJA.hxx>
-#ifndef KRIPKE_USE_OPENMP
-#ifdef RAJA_ENABLE_OPENMP
-#define KRIPKE_USE_OPENMP
-#endif
-#endif
-
-// Make sure that there's openmp support, otherwise error out
-#ifdef KRIPKE_USE_OPENMP
-#ifndef _OPENMP
-#error "OpenMP selected for build, but OpenMP is not available"
-#endif
-#endif
-
-// Forward Decl
-struct Grid_Data;
-
-#define KRESTRICT __restrict__
-
-
-// In Kripke/Sweep_Solver.cpp
-int SweepSolver(Grid_Data *grid_data, bool block_jacobi);
-void SweepSubdomains (std::vector<int> subdomain_list, Grid_Data *grid_data, bool block_jacobi);
-
-/**
- * Tags for choosing which data nesting to be chosen
- */
-enum Nesting_Order {
-  // Nestings for Psi and Phi
-  // D referes to directions OR moments, depending on context
-  NEST_DGZ,
-  NEST_DZG,
-  NEST_GDZ,
-  NEST_GZD,
-  NEST_ZDG,
-  NEST_ZGD
-};
-
-
-/**
-  Tags for which parallel algorithm to use.
-*/
-enum ParallelMethod {
-  PMETHOD_SWEEP,
-  PMETHOD_BJ
-};
-
-/**
- * Converts a nesting tag to a human-readable string.
- */
-inline std::string nestingString(Nesting_Order nesting){
-  switch(nesting){
-    case NEST_DGZ: return("DGZ");
-    case NEST_DZG: return("DZG");
-    case NEST_GDZ: return("GDZ");
-    case NEST_GZD: return("GZD");
-    case NEST_ZDG: return("ZDG");
-    case NEST_ZGD: return("ZGD");
-  }
-  return("UNKNOWN");
-}
-
-/**
- * Converts a string (eg. from command line) to a nesting tag.
- */
-inline Nesting_Order nestingFromString(std::string const &str){
-  for(int i = 0;i < 6;++ i){
-    if(!strcasecmp(str.c_str(), nestingString((Nesting_Order)i).c_str())){
-      return (Nesting_Order)i;
-  }
- }
-  return (Nesting_Order)-1;
-}
-
-
-/**
- * Compares two vectors for differences.
- * Used in testing suite.
- */
-inline bool compareVector(std::string const &name,
-    std::vector<double> const &a,
-    std::vector<double> const &b, double tol, bool verbose){
-
-  if(a.size() != b.size()){
-    if(verbose){
-      printf("Vectors are different lengths: %ld, %ld\n",
-          (long)a.size(), (long)b.size());
-    }
-    return true;
-  }
-
-  bool is_diff = false;
-  for(size_t i = 0;i < a.size();++i){
-    if(std::abs(a[i]-b[i]) > tol){
-      is_diff = true;
-      if(verbose){
-        printf("%s[%d]:%e, %e [%e]\n",
-            name.c_str(), (int)i,
-            a[i], b[i], std::abs(a[i]-b[i]));
-        is_diff = true;
-      }
-      else{
-        break;
-      }
-    }
-  }
-
-  return is_diff;
-}
-
-/**
- * Compares two scalars for differences.
- * Used in testing suite.
- */
-inline bool compareScalar(std::string const &name,
-    double a, double b, double tol, bool verbose){
-
-  if(std::abs(a-b) > tol){
-    if(verbose){
-      printf("%s:%e, %e [%e]\n",
-          name.c_str(),
-          a, b, std::abs(a-b));
-    }
-    return true;
-  }
-  return false;
-}
-
-#endif
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/CMakeLists.txt b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/CMakeLists.txt
deleted file mode 100644
index dbba7a0ab..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/CMakeLists.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-include_directories(..)
-
-set(KRIPKE_SRC 
-  Directions.cpp  
-  Grid.cpp  
-  Input_Variables.cpp
-  Kernel.cpp  
-  Layout.cpp
-  Subdomain.cpp  
-  Sweep_Solver.cpp
-  ParallelComm.cpp  
-  Timing.cpp  
-    
-  Kernel/Kernel_3d_GDZ.cpp
-  Kernel/Kernel_3d_DGZ.cpp
-  Kernel/Kernel_3d_ZDG.cpp
-  Kernel/Kernel_3d_DZG.cpp
-  Kernel/Kernel_3d_ZGD.cpp
-  Kernel/Kernel_3d_GZD.cpp
-  
-  ParallelComm/BlockJacobiComm.cpp
-  ParallelComm/SweepComm.cpp 
-  
-  Test/TestKernels.cpp
-)
-
-if(RAJA_ENABLE_CUDA)
-  cuda_add_library(lib_kripke ${KRIPKE_SRC})  
-else()
-  add_library(lib_kripke ${KRIPKE_SRC})  
-endif() 
-                                                                                                          
-set(KRIPKE_LIBS ${KRIPKE_LIBS} lib_kripke PARENT_SCOPE) 
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Directions.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Directions.cpp
deleted file mode 100644
index a68e1a3ea..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Directions.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke.h>
-#include <Kripke/Directions.h>
-#include <Kripke/Grid.h>
-#include <Kripke/Input_Variables.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <float.h>
-#include <algorithm>
-
-namespace {
-  /*
-    GaussLegendre returns the n point Gauss-Legendre quadrature rule for
-    the integral between x1 and x2.
-  */
-  void GaussLegendre(double x1, double x2, std::vector<double> &x,
-      std::vector<double> &w, double eps)
-  {
-    int n = x.size();
-    int m, j, i;
-    double z1, z, xm, xl, pp, p3, p2, p1;
-
-    m=(n+1)/2;
-    xm=0.5*(x2+x1);
-    xl=0.5*(x2-x1);
-    for(i=1; i<=m; i++){
-      z=cos(M_PI*(i-0.25)/(n+0.5));
-      do {
-        p1=1.0;
-        p2=0.0;
-        for(j=1; j<=n; j++){
-          p3=p2;
-          p2=p1;
-          p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
-        }
-        pp=n*(z*p1-p2)/(z*z-1.0);
-        z1=z;
-        z=z1-p1/pp;
-      } while(fabs(z-z1) > eps);
-      x[i-1]=xm-xl*z;
-      x[n-i]=xm+xl*z;
-      w[i-1]=2.0*xl/((1.0-z*z)*pp*pp);
-
-      w[n-i]=w[i-1];
-    }
-  }
-
-
-  bool dirSortFcn(Directions const &a, Directions const &b){
-    return b.octant < a.octant;
-  }
-}
-
-/**
- * Initializes the quadrature set information for a Grid_Data object.
- * This guarantees that each <GS,DS> pair have a single originating octant.
- */
-void InitDirections(Grid_Data *grid_data, Input_Variables *input_vars)
-{
-  std::vector<Directions> &directions = grid_data->directions;
-
-  // Get set description from user
-  int num_directions_per_octant = input_vars->num_directions/8;
-  int num_directions = input_vars->num_directions;
-
-  // allocate storage
-  directions.resize(num_directions);
-
-  // Are we running a REAL quadrature set?
-  int num_polar = input_vars->quad_num_polar;
-  int num_azimuth = input_vars->quad_num_azimuthal;
-
-  std::vector<double> polar_cos;
-  std::vector<double> polar_weight;
-  if(num_polar > 0){
-    // make sure the user specified the correct number of quadrature points
-    if(num_polar % 4 != 0){
-      KripkeAbort("Must have number of polar angles be a multiple of 4\n");
-    }
-    if(num_azimuth % 2 != 0){
-      KripkeAbort("Must have number of azimuthal angles be a multiple of 2\n");
-    }
-    if(num_polar*num_azimuth != num_directions){
-      KripkeAbort("You need to specify %d total directions, not %d\n",
-          num_polar*num_azimuth, num_directions);
-    }
-
-    // Compute gauss legendre weights
-    polar_cos.resize(num_polar);
-    polar_weight.resize(num_polar);
-    GaussLegendre(-1.0, 1.0, polar_cos, polar_weight, DBL_EPSILON);
-
-    // compute azmuhtal angles and weights
-    std::vector<double> az_angle(num_azimuth);
-    std::vector<double> az_weight(num_azimuth);
-    double dangle = 2.0*M_PI/((double) num_azimuth);
-
-    for(int i=0; i<num_azimuth; i++){
-      if(i == 0){
-        az_angle[0] = dangle/2.0;
-      }
-      else{
-        az_angle[i] = az_angle[i-1] + dangle;
-      }
-      az_weight[i] = dangle;
-    }
-
-
-    // Loop over polar 'octants
-    int d = 0;
-    for(int i=0; i< num_polar; i++){
-      for(int j=0; j< num_azimuth; j++){
-        double xcos = sqrt(1.0-polar_cos[i]*polar_cos[i]) * cos(az_angle[j]);
-        double ycos = sqrt(1.0-polar_cos[i]*polar_cos[i]) * sin(az_angle[j]);
-        double zcos = polar_cos[i];
-        double w = polar_weight[i]*az_weight[j];
-
-        directions[d].id = (xcos > 0.) ? 1 : -1;
-        directions[d].jd = (ycos > 0.) ? 1 : -1;
-        directions[d].kd = (zcos > 0.) ? 1 : -1;
-
-        directions[d].octant = 0;
-        if(directions[d].id == -1){
-          directions[d].octant += 1;
-        }
-        if(directions[d].jd == -1){
-          directions[d].octant += 2;
-        }
-        if(directions[d].kd == -1){
-          directions[d].octant += 4;
-        }
-
-        directions[d].xcos = std::abs(xcos);
-        directions[d].ycos = std::abs(ycos);
-        directions[d].zcos = std::abs(zcos);
-        directions[d].w = w;
-
-        ++ d;
-      }
-    }
-
-    // Sort by octant.. so each set has same directions
-    std::sort(directions.begin(), directions.end(), dirSortFcn);
-  }
-  else{
-    // Do (essentialy) an S2 quadrature.. but with repeated directions
-
-    // Compute x,y,z cosine values
-    double mu  = cos(M_PI/4);
-    double eta = sqrt(1-mu*mu) * cos(M_PI/4);
-    double xi  = sqrt(1-mu*mu) * sin(M_PI/4);
-    int d = 0;
-    for(int octant = 0;octant < 8;++ octant){
-      double omegas[3];
-      omegas[0] = octant & 0x1;
-      omegas[1] = (octant>>1) & 0x1;
-      omegas[2] = (octant>>2) & 0x1;
-
-      for(int sd=0; sd<num_directions_per_octant; sd++, d++){
-        // Store which logical direction of travel we have
-        directions[d].id = (omegas[0] > 0.) ? 1 : -1;
-        directions[d].jd = (omegas[1] > 0.) ? 1 : -1;
-        directions[d].kd = (omegas[2] > 0.) ? 1 : -1;
-
-        // Store quadrature point's weight
-        directions[d].w = 4.0*M_PI / (double)num_directions;
-        directions[d].xcos = mu;
-        directions[d].ycos = eta;
-        directions[d].zcos = xi;
-      }
-    }
-  }
-}
-
-
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Directions.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Directions.h
deleted file mode 100644
index b0e228ad9..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Directions.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_DIRECTIONS_H__
-#define KRIPKE_DIRECTIONS_H__
-
-#include <vector>
-
-struct Grid_Data;
-struct Input_Variables;
-
-/**
- * Contains information needed for one quadrature set direction.
- */
-struct Directions{
-  double xcos;              /* Absolute value of the x-direction cosine. */
-  double ycos;              /* Absolute value of the y-direction cosine. */
-  double zcos;              /* Absolute value of the z-direction cosine. */
-  double w;                 /* weight for the quadrature rule.*/
-  int id;                   /* direction flag (= 1 if x-direction
-                            cosine is positive; = -1 if not). */
-  int jd;                   /* direction flag (= 1 if y-direction
-                            cosine is positive; = -1 if not). */
-  int kd;                   /* direction flag (= 1 if z-direction
-                            cosine is positive; = -1 if not). */
-  int octant;
-};
-
-
-void InitDirections(Grid_Data *grid_data, Input_Variables *input_vars);
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Grid.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Grid.cpp
deleted file mode 100644
index f336930c3..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Grid.cpp
+++ /dev/null
@@ -1,561 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke.h>
-#include <Kripke/Grid.h>
-
-#include <Kripke/Input_Variables.h>
-#include <Kripke/Layout.h>
-#include <Kripke/SubTVec.h>
-#include <cmath>
-#include <sstream>
-
-#ifdef KRIPKE_USE_MPI
-#include <mpi.h>
-#endif
-
-#ifdef KRIPKE_USE_SILO
-#include <sys/stat.h>
-#include <silo.h>
-#include <string.h>
-#endif
-
-/**
- * Grid_Data constructor
-*/
-Grid_Data::Grid_Data(Input_Variables *input_vars)
-{
-  // Create object to describe processor and subdomain layout in space
-  // and their adjacencies
-  Layout *layout = createLayout(input_vars);
-
-  // create the kernel object based on nesting
-  kernel = createKernel(input_vars->nesting, 3);
-
-  // Create quadrature set (for all directions)
-  int total_num_directions = input_vars->num_directions;
-  InitDirections(this, input_vars);
-
-  num_direction_sets = input_vars->num_dirsets;
-  num_directions_per_set = total_num_directions/num_direction_sets;
-  num_group_sets = input_vars->num_groupsets;
-  num_groups_per_set = input_vars->num_groups/ num_group_sets;
-  num_zone_sets = 1;
-  for(int dim = 0;dim < 3;++ dim){
-    num_zone_sets *= input_vars->num_zonesets_dim[dim];
-  }
-
-  legendre_order = input_vars->legendre_order;
-  total_num_moments = (legendre_order+1)*(legendre_order+1);
-
-  int num_subdomains = num_direction_sets*num_group_sets*num_zone_sets;
-
-  Nesting_Order nest = input_vars->nesting;
-
-  /* Set ncalls */
-  niter = input_vars->niter;
-
-  // setup mapping of moments to legendre coefficients
-  moment_to_coeff.resize(total_num_moments);
-  int nm = 0;
-  for(int n = 0;n < legendre_order+1;++ n){
-    for(int m = -n;m <= n; ++ m){
-      moment_to_coeff[nm] = n;
-      ++ nm;
-    }
-  }
-
-  // setup cross-sections
-  int total_num_groups = num_group_sets*num_groups_per_set;
-  sigma_tot.resize(total_num_groups, 0.0);
-
-  // Setup scattering transfer matrix for 3 materials  
-
-  sigs = new SubTVec(kernel->nestingSigs(), total_num_groups*total_num_groups, legendre_order+1, 3);
-
-  // Set to isotropic scattering given user inputs
-  sigs->clear(0.0);
-  for(int mat = 0;mat < 3;++ mat){
-    for(int g = 0;g < total_num_groups;++ g){
-      int idx_g_gp = g*total_num_groups + g;
-      (*sigs)(idx_g_gp, 0, mat) = input_vars->sigs[mat];
-    }
-  }
-
-  // just allocate pointer vectors, we will allocate them below
-  ell.resize(num_direction_sets, NULL);
-  ell_plus.resize(num_direction_sets, NULL);
-  phi.resize(num_zone_sets, NULL);
-  phi_out.resize(num_zone_sets, NULL);
-
-  // Initialize Subdomains
-  zs_to_sdomid.resize(num_zone_sets);
-  subdomains.resize(num_subdomains);
-  for(int gs = 0;gs < num_group_sets;++ gs){
-    for(int ds = 0;ds < num_direction_sets;++ ds){
-      for(int zs = 0;zs < num_zone_sets;++ zs){
-        // Compupte subdomain id
-        int sdom_id = layout->setIdToSubdomainId(gs, ds, zs);
-
-        // Setup the subdomain
-        Subdomain &sdom = subdomains[sdom_id];
-        sdom.setup(sdom_id, input_vars, gs, ds, zs, directions, kernel, layout);
-
-        // Create ell and ell_plus, if this is the first of this ds
-        bool compute_ell = false;
-        if(ell[ds] == NULL){
-          ell[ds] = new SubTVec(kernel->nestingEll(), total_num_moments, sdom.num_directions, 1);
-          ell_plus[ds] = new SubTVec(kernel->nestingEllPlus(), total_num_moments, sdom.num_directions, 1);
-
-          compute_ell = true;
-        }
-
-        // Create phi and phi_out, if this is the first of this zs
-        if(phi[zs] == NULL){
-          phi[zs] = new SubTVec(nest, total_num_groups, total_num_moments, sdom.num_zones);
-          phi_out[zs] = new SubTVec(nest, total_num_groups, total_num_moments, sdom.num_zones);
-        }
-
-        // setup zs to sdom mapping
-        if(gs == 0 && ds == 0){
-          zs_to_sdomid[zs] = sdom_id;
-        }
-
-        // Set the variables for this subdomain
-        sdom.setVars(ell[ds], ell_plus[ds], phi[zs], phi_out[zs]);
-
-        if(compute_ell){
-          // Compute the L and L+ matrices
-          sdom.computeLLPlus(legendre_order);
-        }
-      }
-    }
-  }
-  delete layout;
-
-
-
-  // Now compute number of elements allocated globally,
-  // and get each materials volume
-  long long vec_size[4] = {0,0,0,0};
-  double vec_volume[3] = {0.0, 0.0, 0.0};
-  for(int sdom_id = 0;sdom_id < subdomains.size();++sdom_id){
-    Subdomain &sdom = subdomains[sdom_id];
-    vec_size[0] += sdom.psi->elements;
-    vec_size[1] += sdom.psi->elements;
-  }
-  for(int zs = 0;zs < num_zone_sets;++ zs){
-    vec_size[2] += phi[zs]->elements;
-    vec_size[3] += phi_out[zs]->elements;
-    int sdom_id = zs_to_sdomid[zs];
-    for(int mat = 0;mat < 3;++ mat){
-      vec_volume[mat] += subdomains[sdom_id].reg_volume[mat];
-    }
-  }
-
-
-#ifdef KRIPKE_USE_MPI
-  int mpi_rank;
-  double global_volume[3];
-  long long global_size[4];
-
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-  MPI_Reduce(vec_size, global_size, 4, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
-  MPI_Reduce(vec_volume, global_volume, 3, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-#else
-  int mpi_rank = 0;
-  long long *global_size = vec_size;
-  double *global_volume = vec_volume;
-#endif
-
-  if(mpi_rank == 0){
-    printf("Unknown counts: psi=%ld, rhs=%ld, phi=%ld, phi_out=%ld\n",
-      (long)global_size[0], (long)global_size[1], (long)global_size[2], (long)global_size[3]);
-    printf("Region volumes: Reg1=%e, Reg2=%e, Reg3=%e\n",
-        global_volume[0], global_volume[1], global_volume[2]);
-  }
-}
-
-Grid_Data::~Grid_Data(){
-  delete kernel;
-  for(int zs = 0;zs < num_zone_sets;++ zs){
-    delete phi[zs];
-    delete phi_out[zs];
-  }
-  for(int ds = 0;ds < num_direction_sets;++ ds){
-    delete ell[ds];
-    delete ell_plus[ds];
-  }
-  delete sigs;
-}
-
-/**
- * Randomizes all variables and matrices for testing suite.
- */
-void Grid_Data::randomizeData(void){
-  for(int i = 0;i < sigma_tot.size();++i){
-    sigma_tot[i] = drand48();
-  }
-
-  for(int i = 0;i < directions.size();++i){
-    directions[i].xcos = drand48();
-    directions[i].ycos = drand48();
-    directions[i].zcos = drand48();
-  }
-
-
-  for(int s = 0;s < subdomains.size();++ s){
-    subdomains[s].randomizeData();
-  }
-
-  for(int zs = 0;zs < num_zone_sets;++ zs){
-    phi[zs]->randomizeData();
-    phi_out[zs]->randomizeData();
-  }
-
-  for(int ds = 0;ds < num_direction_sets;++ ds){
-    ell[ds]->randomizeData();
-    ell_plus[ds]->randomizeData();
-  }
-
-  sigs->randomizeData();
-}
-
-
-/**
- * Returns the integral of psi.. to look at convergence
- */
-double Grid_Data::particleEdit(void){
-  // sum up particles for psi and rhs
-  double part = 0.0;
-  for(int sdom_id = 0;sdom_id < subdomains.size();++ sdom_id){
-    Subdomain &sdom = subdomains[sdom_id];
-
-    int num_zones = sdom.num_zones;
-    int num_directions = sdom.num_directions;
-    int num_groups= sdom.num_groups;
-    Directions *dirs = sdom.directions;
-
-    for(int z = 0;z < num_zones;++ z){
-      double vol = sdom.volume[z];
-      for(int d = 0;d < num_directions;++ d){
-        double w = dirs[d].w;
-        for(int g = 0;g < num_groups;++ g){
-          part += w * (*sdom.psi)(g,d,z) * vol;
-        }
-      }
-    }
-  }
-
-  // reduce
-#ifdef KRIPKE_USE_MPI
-  double part_global;
-
-  MPI_Reduce(&part, &part_global, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-
-  return part_global;
-#else
-
-  return part;
-
-#endif
-}
-
-
-/**
- * Copies all variables and matrices for testing suite.
- * Correctly copies data from one nesting to another.
- */
-void Grid_Data::copy(Grid_Data const &b){
-  sigma_tot = b.sigma_tot;
-  directions = b.directions;
-
-  subdomains.resize(b.subdomains.size());
-  for(int s = 0;s < subdomains.size();++ s){
-    subdomains[s].copy(b.subdomains[s]);
-  }
-
-  for(int zs = 0;zs < num_zone_sets;++ zs){
-    phi[zs]->copy(*b.phi[zs]);
-    phi_out[zs]->copy(*b.phi_out[zs]);
-  }
-
-  for(int ds = 0;ds < ell.size();++ ds){
-    ell[ds]->copy(*b.ell[ds]);
-    ell_plus[ds]->copy(*b.ell_plus[ds]);
-  }
-
-  sigs->copy(*b.sigs);
-}
-
-/**
- * Compares all variables and matrices for testing suite.
- * Correctly compares data from one nesting to another.
- */
-bool Grid_Data::compare(Grid_Data const &b, double tol, bool verbose){
-  bool is_diff = false;
-
-  for(int i = 0;i < directions.size();++i){
-    std::stringstream dirname;
-    dirname << "directions[" << i << "]";
-
-    is_diff |= compareScalar(dirname.str()+".xcos",
-        directions[i].xcos, b.directions[i].xcos, tol, verbose);
-
-    is_diff |= compareScalar(dirname.str()+".ycos",
-        directions[i].ycos, b.directions[i].ycos, tol, verbose);
-
-    is_diff |= compareScalar(dirname.str()+".zcos",
-        directions[i].zcos, b.directions[i].zcos, tol, verbose);
-  }
-
-  for(int s = 0;s < subdomains.size();++ s){
-    is_diff |= subdomains[s].compare(
-        b.subdomains[s], tol, verbose);
-
-  }
-  is_diff |= compareVector("sigma_tot", sigma_tot, b.sigma_tot, tol, verbose);
-
-  for(int zs = 0;zs < num_zone_sets;++ zs){
-    is_diff |= phi[zs]->compare("phi", *b.phi[zs], tol, verbose);
-    is_diff |= phi_out[zs]->compare("phi_out", *b.phi_out[zs], tol, verbose);
-  }
-
-  for(int ds = 0;ds < ell.size();++ ds){
-    is_diff |= ell[ds]->compare("ell", *b.ell[ds], tol, verbose);
-    is_diff |= ell_plus[ds]->compare("ell_plus", *b.ell_plus[ds], tol, verbose);
-  }
-
-  is_diff |= sigs->compare("sigs", *b.sigs, tol, verbose);
-
-  return is_diff;
-}
-
-
-#ifdef KRIPKE_USE_SILO
-
-enum MultivarType {
-  MULTI_MESH,
-  MULTI_MAT,
-  MULTI_VAR
-};
-
-namespace {
-  /**
-    Writes a multimesh or multivar to the root file.
-  */
-
-  void siloWriteMulti(DBfile *root, MultivarType mv_type,
-    std::string const &fname_base, std::string const &var_name,
-    std::vector<int> sdom_id_list, int var_type = 0)
-  {
-    int mpi_size;
-    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-    int num_sdom = sdom_id_list.size();
-
-    // setup names and types
-    std::vector<int> var_types(mpi_size*num_sdom, var_type);
-    std::vector<char *> var_names(mpi_size*num_sdom);
-    int var_idx = 0;
-    for(int rank = 0;rank < mpi_size;++ rank){
-      for(int idx = 0;idx < num_sdom;++ idx){
-        int sdom_id = sdom_id_list[idx];
-        std::stringstream name;
-        name << fname_base << "/rank_" << rank << ".silo:/sdom" << sdom_id << "/" << var_name;
-        var_names[var_idx] = strdup(name.str().c_str());
-        var_idx ++;
-      }
-    }
-
-    if(mv_type == MULTI_MESH){
-      DBPutMultimesh(root, var_name.c_str(), mpi_size*num_sdom,
-          &var_names[0], &var_types[0], NULL);
-    }
-    else if(mv_type == MULTI_MAT){
-      DBPutMultimat(root, var_name.c_str(), mpi_size*num_sdom,
-          &var_names[0],  NULL);
-    }
-    else{
-      DBPutMultivar(root, var_name.c_str(), mpi_size*num_sdom,
-          &var_names[0],  &var_types[0] , NULL);
-    }
-
-    // cleanup
-    for(int i = 0;i < mpi_size*num_sdom; ++i){
-      free(var_names[i]);
-    }
-  }
-
-  void siloWriteRectMesh(DBfile *silo_file,
-    std::string const &mesh_name,
-    int const *nzones,
-    double const *zeros,
-    double const *deltas_x,
-    double const *deltas_y,
-    double const *deltas_z)
-  {
-    static char const *coordnames[3] = {"X", "Y", "Z"};
-    double const *deltas[3] = {deltas_x, deltas_y, deltas_z};
-    double *coords[3];
-    for(int dim = 0;dim < 3;++ dim){
-      coords[dim] = new double[nzones[dim]];
-      coords[dim][0] = zeros[dim];
-      for(int z = 0;z < nzones[dim];++ z){
-        coords[dim][1+z] = coords[dim][z] + deltas[dim][z];
-      }
-    }
-    int nnodes[3] = {
-      nzones[0]+1,
-      nzones[1]+1,
-      nzones[2]+1
-    };
-
-    DBPutQuadmesh(silo_file, mesh_name.c_str(), const_cast<char**>(coordnames), coords, nnodes, 3, DB_DOUBLE,
-        DB_COLLINEAR, NULL);
-
-    // cleanup
-    delete[] coords[0];
-    delete[] coords[1];
-    delete[] coords[2];
-  }
-
-
-} //namespace
-
-
-void Grid_Data::writeSilo(std::string const &fname_base){
-
-  // Recompute Phi... so we can write out phi0
-  kernel->LTimes(this);
-
-  int mpi_rank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-
-  if(mpi_rank == 0){
-    // Create a root file
-    std::string fname_root = fname_base + ".silo";
-    DBfile *root = DBCreate(fname_root.c_str(),
-        DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5);
-
-    // Write out multimesh and multivars
-    siloWriteMulti(root, MULTI_MESH, fname_base, "mesh", zs_to_sdomid, DB_QUAD_RECT);
-    siloWriteMulti(root, MULTI_MAT, fname_base, "material", zs_to_sdomid);
-    siloWriteMulti(root, MULTI_VAR, fname_base, "phi0", zs_to_sdomid, DB_QUADVAR);
-
-    // Close root file
-    DBClose(root);
-
-    // Create a subdirectory to hold processor info
-    mkdir(fname_base.c_str(), 0750);
-  }
-
-  // Sync up, so everyone sees the subdirectory
-  MPI_Barrier(MPI_COMM_WORLD);
-
-  // Create our processor file
-  std::stringstream ss_proc;
-  ss_proc << fname_base << "/rank_" << mpi_rank << ".silo";
-  DBfile *proc = DBCreate(ss_proc.str().c_str(),
-      DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5);
-
-  // Write out data for each subdomain
-  int num_zone_sets = zs_to_sdomid.size();
-  for(int sdom_idx = 0;sdom_idx < num_zone_sets;++ sdom_idx){
-    int sdom_id = zs_to_sdomid[sdom_idx];
-    Subdomain &sdom = subdomains[sdom_id];
-
-    // Create a directory for the subdomain
-    std::stringstream dirname;
-    dirname << "/sdom" << sdom_id;
-    DBMkDir(proc, dirname.str().c_str());
-
-    // Set working directory
-    DBSetDir(proc, dirname.str().c_str());
-
-    // Write the mesh
-    siloWriteRectMesh(proc, "mesh", sdom.nzones, sdom.zeros,
-      &sdom.deltas[0][1], &sdom.deltas[1][1], &sdom.deltas[2][1]);
-
-
-    // Write the material
-    {
-      int num_zones = sdom.num_zones;
-      int num_mixed = sdom.mixed_material.size();
-      int matnos[3] = {1, 2, 3};
-      std::vector<int> matlist(num_zones, 0);
-      std::vector<int> mix_next(num_mixed, 0);
-      std::vector<int> mix_mat(num_mixed, 0);
-
-      // setup matlist and mix_next arrays
-      int last_z = -1;
-      for(int m = 0;m < num_mixed;++ m){
-        mix_mat[m] = sdom.mixed_material[m] + 1;
-        int z = sdom.mixed_to_zones[m];
-        if(matlist[z] == 0){
-            matlist[z] = -(1+m);
-        }
-        // if we are still on the same zone, make sure the last mix points
-        // here
-        if(z == last_z){
-          mix_next[m-1] = m+1;
-        }
-        last_z = z;
-      }
-
-      DBPutMaterial(proc, "material", "mesh", 3, matnos,
-          &matlist[0], sdom.nzones, 3,
-          &mix_next[0], &mix_mat[0], &sdom.mixed_to_zones[0], &sdom.mixed_fraction[0], num_mixed,
-          DB_DOUBLE, NULL);
-    }
-
-    // Write phi0
-    {
-
-      int num_zones = sdom.num_zones;
-      std::vector<double> phi0(num_zones);
-
-      // extract phi0 from phi for the 0th group
-      for(int z = 0;z < num_zones;++ z){
-        phi0[z] = (*sdom.phi)(0,0,z);
-      }
-
-      DBPutQuadvar1(proc, "phi0", "mesh", &phi0[0],
-          sdom.nzones, 3, NULL, 0, DB_DOUBLE, DB_ZONECENT, NULL);
-    }
-  }
-
-  // Close processor file
-  DBClose(proc);
-}
-#endif
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Grid.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Grid.h
deleted file mode 100644
index ff539b866..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Grid.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_GRID_DATA_H__
-#define KRIPKE_GRID_DATA_H__
-
-#include <Kripke.h>
-#include <Kripke/Directions.h>
-#include <Kripke/Kernel.h>
-#include <Kripke/Subdomain.h>
-#include <Kripke/Timing.h>
-#include <vector>
-
-// Foreward Decl
-struct Input_Variables;
-struct SubTVec;
-
-
-/**
- * Contains all grid parameters and variables.
- */
-struct Grid_Data {
-public:
-  explicit Grid_Data(Input_Variables *input_vars);
-  ~Grid_Data();
-
-  void randomizeData(void);
-  void copy(Grid_Data const &b);
-  bool compare(Grid_Data const &b, double tol, bool verbose);
-  double particleEdit(void);
-#ifdef KRIPKE_USE_SILO
-  void writeSilo(std::string const &fname);
-#endif
-
-  Timing timing;
-
-  int niter;
-
-  double source_value;
-
-  std::vector<double> sigma_tot;            // Cross section data
-
-  int num_group_sets;                       // Number of group-sets
-  int num_groups_per_set;                   // How many groups in each set
-  int num_direction_sets;                   // Number of direction-sets
-  int num_directions_per_set;               // Number of directions per dir set
-  int num_zone_sets;                        // Number of zone sets
-  int legendre_order;                       // Legendra expansion order ( >= 0 )
-  int total_num_moments;                    // Number of spherical harmonic moments
-
-  std::vector<int> moment_to_coeff;         // Map from harmonic moments to legendre coefficients
-
-  std::vector<Directions> directions;       // Quadrature point data, for all directions
-  Kernel *kernel;                           // Layout-specific math kernels
-
-  std::vector<Subdomain> subdomains;        // Group/Angle/Zone set data
-  std::vector<int> zs_to_sdomid;            // map of zonesets to subdomains with ds=gs=0
-
-  // Variables:
-  SubTVec *sigs;                            // scattering lookup table for each material
-                                            // G=g->gp, D=legendre coeff, Z=matidx
-
-  // Per directionset ell and ell_plus matrices (Subdomain point into these arrays)
-  std::vector<SubTVec *> ell;               // L matrix in nm_offset coordinates
-  std::vector<SubTVec *> ell_plus;          // L+ matrix in nm_offset coordinates
-
-  // Per zoneset phi and phi_out (Subdomains point into these arrays)
-  std::vector<SubTVec *> phi;               // Moments of psi
-  std::vector<SubTVec *> phi_out;           // Scattering source
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Input_Variables.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Input_Variables.cpp
deleted file mode 100644
index b1f13d4b0..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Input_Variables.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Input_Variables.h>
-
-#ifdef KRIPKE_US_MPI
-#include<mpi.h>
-#endif
-
-/**
-* Setup the default input choices
-*/
-Input_Variables::Input_Variables() : 
-  nx(16), ny(16), nz(16),
-  num_directions(96),
-  num_groups(32),
-  legendre_order(4),
-  quad_num_polar(0),
-  quad_num_azimuthal(0),
- 
-  nesting(NEST_DGZ),
- 
-  npx(1), npy(1), npz(1),
-  num_dirsets(8),
-  num_groupsets(2),
-  layout_pattern(0),
-  
-  niter(10),
-  parallel_method(PMETHOD_SWEEP),
-  run_name("kripke")
-{
-  num_zonesets_dim[0] = 1; 
-  num_zonesets_dim[1] = 1;
-  num_zonesets_dim[2] = 1;
-
-  sigt[0] = 0.1;  
-  sigt[1] = 0.0001;
-  sigt[2] = 0.1;
-  
-  sigs[0] = 0.05;  
-  sigs[1] = 0.00005;
-  sigs[2] = 0.05; 
-}
-
-/**
- *  Checks validity of inputs, returns 'true' on error.
- */
-bool Input_Variables::checkValues(void) const{
-  // make sure any output only goes to root
-#ifdef KRIPKE_USE_MPI
-  int rank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-#else
-  int rank = 0;
-#endif
-
-  if(num_zonesets_dim[0] <= 0 || num_zonesets_dim[1] <= 0 || num_zonesets_dim[2] <= 0){
-    if(!rank)
-      printf("Number of zone-sets in each dim need to be greater than or equal to 1\n");
-    return true;
-  }
-  
-  if(layout_pattern < 0 || layout_pattern > 1){
-    if(!rank)
-      printf("Layout(%d) must be either 0 or 1\n", layout_pattern);
-    return true;
-  }
-  
-  if(nesting < 0){
-    if(!rank)
-      printf("Invalid nesting selected\n");
-    return true;
-  }
-  
-  if(num_groups < 1){
-    if(!rank)
-      printf("Number of groups (%d) needs to be at least 1\n", num_groups);
-    return true;
-  }
-  
-  if(num_groups % num_groupsets){
-    if(!rank)
-      printf("Number of groups (%d) must be evenly divided by number of groupsets (%d)\n",
-        num_groups, num_groupsets);
-    return true;
-  }
-  
-  if(num_directions < 8){
-    if(!rank)
-      printf("Number of directions (%d) needs to be at least 8\n", num_directions);
-    return true;
-  }
-  
-  if(num_dirsets % 8 && num_dirsets < 8){
-    if(!rank)
-      printf("Number of direction sets (%d) must be a multiple of 8\n", num_dirsets);
-    return true;
-  }
-  
-  if(num_directions % num_dirsets){
-    if(!rank)
-      printf("Number of directions (%d) must be evenly divided by number of directionsets(%d)\n",
-        num_directions, num_dirsets);
-    return true;
-  }
-  
-  if(legendre_order < 0){
-    if(!rank)
-      printf("Legendre scattering order (%d) must be >= 0\n", legendre_order);
-    return true;
-  }
-  
-  if(niter < 1){
-    if(!rank)
-      printf("You must run at least one iteration (%d)\n", niter);
-    return true;
-  }
-  
-  return false;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Input_Variables.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Input_Variables.h
deleted file mode 100644
index 9d3f40573..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Input_Variables.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_INPUT_VARIABLES_H__
-#define KRIPKE_INPUT_VARIABLES_H__
-
-#include<Kripke.h>
-
-/**
- * This structure defines the input parameters to setup a problem.
- */
-
-struct Input_Variables {
-  Input_Variables();
-  
-  bool checkValues(void) const;
-  
-  // Problem Description
-  int nx, ny, nz;               // Number of spatial zones in x,y,z
-  int num_directions;           // Total number of directions
-  int num_groups;               // Total number of energy groups
-  int legendre_order;           // Scattering order (number Legendre coeff's - 1)
-  int quad_num_polar;           // Number of polar quadrature points (0 for dummy)
-  int quad_num_azimuthal;       // Number of azimuthal quadrature points (0 for dummy)
-
-  // On-Node Options
-  Nesting_Order nesting;        // Data layout and loop ordering (of Psi)
-  
-  // Parallel Decomp
-  int npx, npy, npz;            // The number of processors in x,y,z
-  int num_dirsets;              // Number of direction sets
-  int num_groupsets;            // Number of energy group sets
-  int num_zonesets_dim[3];      // Number of zoneset in x, y, z  
-  int layout_pattern;           // Which subdomain/task layout to use
-  
-  // Physics and Solver Options
-  int niter;                    // number of solver iterations to run
-  ParallelMethod parallel_method;
-  double sigt[3];               // total cross section for 3 materials
-  double sigs[3];               // total scattering cross section for 3 materials
-  
-  // Output Options
-  std::string run_name;         // Name to use when generating output files
-#ifdef KRIPKE_USE_SILO
-  std::string silo_basename;    // name prefix for silo output files
-#endif
-
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel.cpp
deleted file mode 100644
index b47cf201d..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel.h>
-#include<Kripke/Grid.h>
-#include<Kripke/SubTVec.h>
-
-#include<Kripke/Kernel/Kernel_3d_GDZ.h>
-#include<Kripke/Kernel/Kernel_3d_DGZ.h>
-#include<Kripke/Kernel/Kernel_3d_ZDG.h>
-#include<Kripke/Kernel/Kernel_3d_DZG.h>
-#include<Kripke/Kernel/Kernel_3d_ZGD.h>
-#include<Kripke/Kernel/Kernel_3d_GZD.h>
-
-Kernel::~Kernel(){}
-
-/**
- * Factory to create a kernel object for the specified nesting
- */
-Kernel *createKernel(Nesting_Order nest, int num_dims){
-  if(num_dims == 3){
-    switch(nest){
-    case NEST_GDZ:
-      return new Kernel_3d_GDZ();
-    case NEST_DGZ:
-      return new Kernel_3d_DGZ();
-    case NEST_ZDG:
-      return new Kernel_3d_ZDG();
-    case NEST_DZG:
-      return new Kernel_3d_DZG();
-    case NEST_ZGD:
-      return new Kernel_3d_ZGD();
-    case NEST_GZD:
-      return new Kernel_3d_GZD();
-    }
-  }
-
-	KripkeAbort("Unknown nesting order %d\n", (int)nest);
-  return NULL;
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel.h
deleted file mode 100644
index 733ddd616..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_H__
-#define KRIPKE_KERNEL_H__
-
-#include <Kripke.h>
-
-struct Grid_Data;
-struct SubTVec;
-struct Subdomain;
-
-/**
- * This is the Kernel base-class and interface definition.
- * This abstracts the storage of Psi, Phi, L, L+ from the rest of the code,
- * providing data-layout specific routines.
- */
-class Kernel {
-  public:
-		virtual ~Kernel();
-
-    virtual Nesting_Order nestingPsi(void) const = 0;
-    virtual Nesting_Order nestingPhi(void) const = 0;
-    virtual Nesting_Order nestingSigt(void) const = 0;
-    virtual Nesting_Order nestingEll(void) const = 0;
-    virtual Nesting_Order nestingEllPlus(void) const = 0;
-    virtual Nesting_Order nestingSigs(void) const = 0;
-
-    // Computational Kernels
-    virtual void LTimes(Grid_Data *grid_data) = 0;
-    virtual void LPlusTimes(Grid_Data *grid_data) = 0;
-    virtual void scattering(Grid_Data *grid_data) = 0;
-    virtual void source(Grid_Data *grid_data) = 0;
-    virtual void sweep(Subdomain *ga_set) = 0;
-};
-
-
-// Factory to create correct kernel object
-Kernel *createKernel(Nesting_Order, int num_dims);
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DGZ.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DGZ.cpp
deleted file mode 100644
index 7dc469700..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DGZ.cpp
+++ /dev/null
@@ -1,383 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_DGZ.h>
-#include<Kripke/Grid.h>
-#include<Kripke/SubTVec.h>
-
-Kernel_3d_DGZ::~Kernel_3d_DGZ() {}
-
-Nesting_Order Kernel_3d_DGZ::nestingPsi(void) const {
-  return NEST_DGZ;
-}
-
-Nesting_Order Kernel_3d_DGZ::nestingPhi(void) const {
-  return NEST_DGZ;
-}
-
-Nesting_Order Kernel_3d_DGZ::nestingSigt(void) const {
-  return NEST_DGZ;
-}
-
-Nesting_Order Kernel_3d_DGZ::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_DGZ::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_DGZ::nestingSigs(void) const {
-  return NEST_DGZ;
-}
-
-
-void Kernel_3d_DGZ::LTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Zero Phi
-  for(int ds = 0;ds < grid_data->num_zone_sets;++ ds){
-    grid_data->phi[ds]->clear(0.0);
-  }
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_zones = sdom.num_zones;
-    int num_groups = sdom.phi->groups;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_gz = num_groups*num_zones;
-    int num_locgz = num_local_groups*num_zones;
-    
-    // Get pointers
-    double const * KRESTRICT ell = sdom.ell->ptr();
-    double const * KRESTRICT psi = sdom.psi->ptr();
-    double       * KRESTRICT phi = sdom.phi->ptr();
-    
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel
-#endif
-		{
-			for(int nm = 0;nm < num_moments;++nm){
-				double const * KRESTRICT ell_nm = ell + nm*num_local_directions;      
-				double       * KRESTRICT phi_nm = phi + nm*num_gz + group0*num_zones;
-				
-				for (int d = 0; d < num_local_directions; d++) {
-					double const * KRESTRICT psi_d = psi + d*num_locgz;
-					double const             ell_nm_d = ell_nm[d];
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp for nowait
-#endif
-					for(int gz = 0;gz < num_locgz; ++ gz){
-						phi_nm[gz] += ell_nm_d * psi_d[gz];
-					}
-				}     
-			}
-		} 
-	}
-}
-
-void Kernel_3d_DGZ::LPlusTimes(Grid_Data *grid_data) {
-	// Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_zones = sdom.num_zones;
-    int num_local_groups = sdom.num_groups;
-    int num_groups = sdom.phi_out->groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_groups_zones = num_local_groups*num_zones;
-    
-    // Zero RHS
-    sdom.rhs->clear(0.0);
-
-    // Get pointers
-    double const * KRESTRICT phi_out = sdom.phi_out->ptr() + group0*num_zones;
-    double const * KRESTRICT ell_plus = sdom.ell_plus->ptr();
-    double       * KRESTRICT rhs = sdom.rhs->ptr();
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel
-#endif
-		{
-			for (int d = 0; d < num_local_directions; d++) {      
-				double       * KRESTRICT rhs_d = rhs + d*num_groups_zones;
-				double const * KRESTRICT ell_plus_d = ell_plus + d*num_moments;
-				
-				for(int nm = 0;nm < num_moments;++nm){
-					double const ell_plus_d_nm = ell_plus_d[nm];
-					double const * KRESTRICT phi_out_nm = phi_out + nm*num_groups*num_zones;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp for nowait
-#endif
-					for(int gz = 0;gz < num_groups_zones; ++ gz){
-						rhs_d[gz] += ell_plus_d_nm * phi_out_nm[gz];
-					}
-				}
-			}
-		}
-	}
-}
-
-/**
-  Compute scattering source term phi_out from flux moments in phi.
-  phi_out(gp,z,nm) = sum_g { sigs(g, n, gp) * phi(g,z,nm) }
-*/
-void Kernel_3d_DGZ::scattering(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT zones_to_mixed = &sdom.zones_to_mixed[0];
-    int    const * KRESTRICT num_mixed = &sdom.num_mixed[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double const * KRESTRICT sigs = grid_data->sigs->ptr(); 
-
-    int    const * KRESTRICT moment_to_coeff = &grid_data->moment_to_coeff[0];
-    double const * KRESTRICT phi = grid_data->phi[zs]->ptr();
-    double       * KRESTRICT phi_out = grid_data->phi_out[zs]->ptr();
-
-    // Zero out source terms
-    grid_data->phi_out[zs]->clear(0.0);
-
-    // grab dimensions
-    //int num_mixed = sdom.mixed_to_zones.size();
-    int num_zones = sdom.num_zones;
-    int num_groups = grid_data->phi_out[zs]->groups;
-    int num_moments = grid_data->total_num_moments;
-    int num_gz = num_groups*num_zones;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel
-#endif
-		{
-			for(int nm = 0;nm < num_moments;++ nm){
-				// map nm to n
-				int n = moment_to_coeff[nm];
-				double const * KRESTRICT sigs_n = sigs + n*3*num_groups*num_groups;
-				double const * KRESTRICT phi_nm = phi + nm*num_gz;
-				double       * KRESTRICT phi_out_nm = phi_out + nm*num_gz;
-
-				for(int g = 0;g < num_groups;++ g){      
-					double const * KRESTRICT sigs_n_g = sigs_n + g*3*num_groups;
-					double const * KRESTRICT phi_nm_g = phi_nm + g*num_zones;
-                
-					for(int gp = 0;gp < num_groups;++ gp){
-						double const * KRESTRICT sigs_n_g_gp = sigs_n_g + gp*3;
-						double       * KRESTRICT phi_out_nm_gp = phi_out_nm + gp*num_zones;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp for nowait
-#endif
-						for(int zone = 0;zone < num_zones;++ zone){
-							double phi_out_nm_gp_z = 0.0;
-							int mix_start = zones_to_mixed[zone];
-							int mix_stop = mix_start + num_mixed[zone];
-
-							for(int mix = mix_start;mix < mix_stop;++ mix){
-								int material = mixed_material[mix];
-								double fraction = mixed_fraction[mix];
-
-								phi_out_nm_gp_z += sigs_n_g_gp[material] * phi_nm_g[zone] * fraction;
-							}
-							phi_out_nm_gp[zone] += phi_out_nm_gp_z;
-						}
-					}
-        }        
-      }
-    }
-  }
-}
-
-
-/**
- * Add an isotropic source, with flux of 1, to every zone with Region 1
- * (or material 0).
- *
- * Since it's isotropic, we're just adding this to nm=0.
- */
-void Kernel_3d_DGZ::source(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-    // get the phi and phi out references
-    SubTVec &phi_out = *grid_data->phi_out[zs];
-
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT mixed_to_zones = &sdom.mixed_to_zones[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double       * KRESTRICT phi_out_nm0 = phi_out.ptr();
-
-    // grab dimensions
-    int num_mixed = sdom.mixed_to_zones.size();
-    int num_zones = sdom.num_zones;
-    int num_groups = phi_out.groups;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for collapse(2)
-#endif
-    for(int g = 0;g < num_groups;++ g){
-      for(int mix = 0;mix < num_mixed;++ mix){
-      	double       * KRESTRICT phi_out_nm0_g = phi_out_nm0 + g*num_zones;
-        int zone = mixed_to_zones[mix];
-        int material = mixed_material[mix];
-        double fraction = mixed_fraction[mix];
-
-        if(material == 0){
-          phi_out_nm0_g[zone] += 1.0 * fraction;
-        }
-      }
-    }
-  }
-}
-
-
-// Macros for offsets with fluxes on cell faces 
-#define I_PLANE_INDEX(j, k) ((k)*(local_jmax) + (j))
-#define J_PLANE_INDEX(i, k) ((k)*(local_imax) + (i))
-#define K_PLANE_INDEX(i, j) ((j)*(local_imax) + (i))
-#define Zonal_INDEX(i, j, k) ((i) + (local_imax)*(j) \
-  + (local_imax)*(local_jmax)*(k))
-
-void Kernel_3d_DGZ::sweep(Subdomain *sdom) {
-  int num_directions = sdom->num_directions;
-  int num_groups = sdom->num_groups;
-  int num_zones = sdom->num_zones;
-
-  Directions *direction = sdom->directions;
-
-  int local_imax = sdom->nzones[0];
-  int local_jmax = sdom->nzones[1];
-  int local_kmax = sdom->nzones[2];
-
-  double const * KRESTRICT dx = &sdom->deltas[0][0];
-  double const * KRESTRICT dy = &sdom->deltas[1][0];
-  double const * KRESTRICT dz = &sdom->deltas[2][0];
-  
-  double const * KRESTRICT sigt = sdom->sigt->ptr();
-  double       * KRESTRICT psi  = sdom->psi->ptr();
-  double const * KRESTRICT rhs  = sdom->rhs->ptr();
-
-  double * KRESTRICT psi_lf = sdom->plane_data[0]->ptr();
-  double * KRESTRICT psi_fr = sdom->plane_data[1]->ptr();
-  double * KRESTRICT psi_bo = sdom->plane_data[2]->ptr();
-  
-  int num_gz = num_groups * num_zones;
-  int num_gz_i = local_jmax * local_kmax * num_groups;
-  int num_gz_j = local_imax * local_kmax * num_groups;
-  int num_gz_k = local_imax * local_jmax * num_groups;
-  int num_z_i = local_jmax * local_kmax;
-  int num_z_j = local_imax * local_kmax;
-  int num_z_k = local_imax * local_jmax;
-
-  // All directions have same id,jd,kd, since these are all one Direction Set
-  // So pull that information out now
-  Grid_Sweep_Block const &extent = sdom->sweep_block;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-  for (int d = 0; d < num_directions; ++d) {
-    double xcos = 2.0 * direction[d].xcos;
-    double ycos = 2.0 * direction[d].ycos;
-    double zcos = 2.0 * direction[d].zcos;
-    
-    double       * KRESTRICT psi_d  = psi  + d*num_gz;
-    double const * KRESTRICT rhs_d  = rhs  + d*num_gz;
-
-    double       * KRESTRICT psi_lf_d = psi_lf + d*num_gz_i;
-    double       * KRESTRICT psi_fr_d = psi_fr + d*num_gz_j;
-    double       * KRESTRICT psi_bo_d = psi_bo + d*num_gz_k;
-
-    for (int g = 0; g < num_groups; ++g) {
-      double const * KRESTRICT sigt_g  = sigt + g*num_zones;
-      double       * KRESTRICT psi_d_g = psi_d + g*num_zones;
-      double const * KRESTRICT rhs_d_g = rhs_d + g*num_zones;
-      
-      double       * KRESTRICT psi_lf_d_g = psi_lf_d + g*num_z_i;
-      double       * KRESTRICT psi_fr_d_g = psi_fr_d + g*num_z_j;
-      double       * KRESTRICT psi_bo_d_g = psi_bo_d + g*num_z_k;
-
-      for (int k = extent.start_k; k != extent.end_k; k += extent.inc_k) {       
-        double zcos_dzk = zcos / dz[k + 1];
-        
-        for (int j = extent.start_j; j != extent.end_j; j += extent.inc_j) {
-          double ycos_dyj = ycos / dy[j + 1];
-          
-          for (int i = extent.start_i; i != extent.end_i; i += extent.inc_i) {
-            double xcos_dxi = xcos / dx[i + 1];
-            
-            int z_idx = Zonal_INDEX(i, j, k);
-            int z_i = I_PLANE_INDEX(j, k);
-            int z_j = J_PLANE_INDEX(i, k);
-            int z_k = K_PLANE_INDEX(i, j);
-
-            /* Calculate new zonal flux */
-            double psi_d_g_z = (rhs_d_g[z_idx]
-                + psi_lf_d_g[z_i] * xcos_dxi
-                + psi_fr_d_g[z_j] * ycos_dyj
-                + psi_bo_d_g[z_k] * zcos_dzk)
-                / (xcos_dxi + ycos_dyj + zcos_dzk + sigt_g[z_idx]);
-
-            psi_d_g[z_idx] = psi_d_g_z;
-            
-            /* Apply diamond-difference relationships */
-            psi_lf_d_g[z_i] = 2.0 * psi_d_g_z - psi_lf_d_g[z_i];
-            psi_fr_d_g[z_j] = 2.0 * psi_d_g_z - psi_fr_d_g[z_j];
-            psi_bo_d_g[z_k] = 2.0 * psi_d_g_z - psi_bo_d_g[z_k];
-          }
-        }
-      }
-    } // group
-  } // direction
-
-}
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DGZ.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DGZ.h
deleted file mode 100644
index c43ae2381..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DGZ.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_DGZ_H__
-#define KRIPKE_KERNEL_3D_DGZ_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_DGZ : public Kernel {
-  public:
-    virtual ~Kernel_3d_DGZ();
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-
-    virtual void LTimes(Grid_Data *grid_data);
-    virtual void LPlusTimes(Grid_Data *grid_data);
-    virtual void scattering(Grid_Data *grid_data);
-    virtual void source(Grid_Data *grid_data);
-    virtual void sweep(Subdomain *ga_set);
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DZG.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DZG.cpp
deleted file mode 100644
index a19a01292..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DZG.cpp
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_DZG.h>
-#include<Kripke/Grid.h>
-#include<Kripke/SubTVec.h>
-
-Kernel_3d_DZG::~Kernel_3d_DZG() {}
-
-Nesting_Order Kernel_3d_DZG::nestingPsi(void) const {
-  return NEST_DZG;
-}
-
-Nesting_Order Kernel_3d_DZG::nestingPhi(void) const {
-  return NEST_DZG;
-}
-
-Nesting_Order Kernel_3d_DZG::nestingSigt(void) const {
-  return NEST_DZG;
-}
-
-Nesting_Order Kernel_3d_DZG::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_DZG::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_DZG::nestingSigs(void) const {
-  return NEST_DZG;
-}
-
-
-void Kernel_3d_DZG::LTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Zero Phi
-  for(int ds = 0;ds < grid_data->num_zone_sets;++ ds){
-    grid_data->phi[ds]->clear(0.0);
-  }
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_zones = sdom.num_zones;
-    int num_groups = sdom.phi->groups;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_gz = num_groups*num_zones;
-    int num_locgz = num_local_groups*num_zones;
-    
-    // Get pointers
-    double const * KRESTRICT ell = sdom.ell->ptr();
-    double const * KRESTRICT psi = sdom.psi->ptr();
-    double       * KRESTRICT phi = sdom.phi->ptr();
-    
-    for(int nm = 0;nm < num_moments;++nm){
-      double const * KRESTRICT ell_nm = ell + nm*num_local_directions;      
-      double       * KRESTRICT phi_nm = phi + nm*num_gz;
-      
-      for (int d = 0; d < num_local_directions; d++) {
-        double const * KRESTRICT psi_d = psi + d*num_locgz;
-        double const             ell_nm_d = ell_nm[d];
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-        for (int z = 0;z < num_zones;++ z){               
-          double const * KRESTRICT psi_d_z  = psi_d + z*num_local_groups;
-          double       * KRESTRICT phi_nm_z = phi_nm + z*num_groups + group0;
-          
-          for(int g = 0;g < num_local_groups; ++ g){  
-            phi_nm_z[g] += ell_nm_d * psi_d_z[g];
-          }
-        }
-      }
-    }
-  } 
-}
-
-void Kernel_3d_DZG::LPlusTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_zones = sdom.num_zones;
-    int num_groups = sdom.phi_out->groups;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_gz = num_groups*num_zones;
-    int num_locgz = num_local_groups*num_zones;
-
-    // Zero RHS
-    sdom.rhs->clear(0.0);
-
-    // Get pointers
-    double const * KRESTRICT phi_out = sdom.phi_out->ptr() + group0;
-    double const * KRESTRICT ell_plus = sdom.ell_plus->ptr();
-    double       * KRESTRICT rhs = sdom.rhs->ptr();
-
-    for (int d = 0; d < num_local_directions; d++) {
-      double       * KRESTRICT rhs_d = rhs + d*num_locgz;
-      double const * KRESTRICT ell_plus_d = ell_plus + d*num_moments;
-
-      for(int nm = 0;nm < num_moments;++nm){
-        double const             ell_plus_d_nm = ell_plus_d[nm];
-        double const * KRESTRICT phi_out_nm = phi_out + nm*num_gz;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-        for(int z = 0;z < num_zones;++ z){
-          double const * KRESTRICT phi_out_nm_z = phi_out_nm + z*num_groups;
-          double       * KRESTRICT rhs_d_z = rhs_d + z*num_local_groups;
-
-          for(int g = 0;g < num_local_groups;++ g){
-            rhs_d_z[g] += ell_plus_d_nm * phi_out_nm_z[g];
-          }
-        }
-      }
-    }
-  }
-}
-
-
-/**
-  Compute scattering source term phi_out from flux moments in phi.
-  phi_out(gp,z,nm) = sum_g { sigs(g, n, gp) * phi(g,z,nm) }
-
-*/
-void Kernel_3d_DZG::scattering(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT zones_to_mixed = &sdom.zones_to_mixed[0];
-    int    const * KRESTRICT num_mixed = &sdom.num_mixed[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double const * KRESTRICT sigs = grid_data->sigs->ptr(); 
-
-    int    const * KRESTRICT moment_to_coeff = &grid_data->moment_to_coeff[0];
-    double const * KRESTRICT phi = grid_data->phi[zs]->ptr();
-    double       * KRESTRICT phi_out = grid_data->phi_out[zs]->ptr();
-
-    // Zero out source terms
-    grid_data->phi_out[zs]->clear(0.0);
-
-    // grab dimensions
-    int num_zones = sdom.num_zones;
-    int num_groups = grid_data->phi_out[zs]->groups;
-    int num_moments = grid_data->total_num_moments;
-    int num_gz = num_groups*num_zones;
-
-    for(int nm = 0;nm < num_moments;++ nm){
-      // map nm to n
-      int n = moment_to_coeff[nm];
-      double const * KRESTRICT sigs_n = sigs + n*3*num_groups*num_groups;
-      double const * KRESTRICT phi_nm = phi + nm*num_gz;
-      double       * KRESTRICT phi_out_nm = phi_out + nm*num_gz;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-      for(int zone = 0;zone < num_zones;++ zone){
-        int mix_start = zones_to_mixed[zone];
-        int mix_stop = mix_start + num_mixed[zone];
-
-        for(int mix = mix_start;mix < mix_stop;++ mix){
-          int material = mixed_material[mix];
-          double fraction = mixed_fraction[mix];
-
-          double const * KRESTRICT sigs_n_mat = sigs_n + material*num_groups*num_groups;
-          double const * KRESTRICT phi_nm_z = phi_nm + zone*num_groups;
-          double       * KRESTRICT phi_out_nm_z = phi_out_nm + zone*num_groups;
-
-          for(int g = 0;g < num_groups;++ g){      
-            double const * KRESTRICT sigs_n_mat_g = sigs_n_mat + g*num_groups;
-            double const             phi_nm_z_g = phi_nm_z[g];
-                    
-            for(int gp = 0;gp < num_groups;++ gp){           
-              phi_out_nm_z[gp] += sigs_n_mat_g[gp] * phi_nm_z_g * fraction;
-            }
-          }        
-        }
-      }
-    }
-  }
-}
-
-/**
- * Add an isotropic source, with flux of 1, to every zone with Region 1
- * (or material 0).
- *
- * Since it's isotropic, we're just adding this to nm=0.
- */
-void Kernel_3d_DZG::source(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-    // get the phi and phi out references
-    SubTVec &phi_out = *grid_data->phi_out[zs];
-
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT zones_to_mixed = &sdom.zones_to_mixed[0];
-    int    const * KRESTRICT num_mixed = &sdom.num_mixed[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double       * KRESTRICT phi_out_nm0 = phi_out.ptr();
-
-    // grab dimensions
-    int num_zones = sdom.num_zones;
-    int num_groups = phi_out.groups;
-    
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int zone = 0;zone < num_zones;++ zone){
-      int mix_start = zones_to_mixed[zone];
-      int mix_stop = mix_start + num_mixed[zone];
-
-      for(int mix = mix_start;mix < mix_stop;++ mix){
-        int material = mixed_material[mix];
-        double fraction = mixed_fraction[mix];
-        double * KRESTRICT phi_out_nm0_z = phi_out_nm0 + zone*num_groups;
-
-        if(material == 0){
-          for(int g = 0;g < num_groups;++ g){
-            phi_out_nm0_z[g] += 1.0 * fraction;
-          }
-        }
-      }
-    }
-  }
-}
-
-
-
-// Macros for offsets with fluxes on cell faces 
-#define I_PLANE_INDEX(j, k) ((k)*(local_jmax) + (j))
-#define J_PLANE_INDEX(i, k) ((k)*(local_imax) + (i))
-#define K_PLANE_INDEX(i, j) ((j)*(local_imax) + (i))
-#define Zonal_INDEX(i, j, k) ((i) + (local_imax)*(j) \
-  + (local_imax)*(local_jmax)*(k))
-  
-void Kernel_3d_DZG::sweep(Subdomain *sdom) {
-  int num_directions = sdom->num_directions;
-  int num_groups = sdom->num_groups;
-  int num_zones = sdom->num_zones;
-
-  Directions *direction = sdom->directions;
-
-  int local_imax = sdom->nzones[0];
-  int local_jmax = sdom->nzones[1];
-  int local_kmax = sdom->nzones[2];
-
-  double const * KRESTRICT dx = &sdom->deltas[0][0];
-  double const * KRESTRICT dy = &sdom->deltas[1][0];
-  double const * KRESTRICT dz = &sdom->deltas[2][0];
-  
-  double const * KRESTRICT sigt = sdom->sigt->ptr();
-  double       * KRESTRICT psi  = sdom->psi->ptr();
-  double const * KRESTRICT rhs  = sdom->rhs->ptr();
-
-  double * KRESTRICT psi_lf = sdom->plane_data[0]->ptr();
-  double * KRESTRICT psi_fr = sdom->plane_data[1]->ptr();
-  double * KRESTRICT psi_bo = sdom->plane_data[2]->ptr();
-  
-  int num_zg = num_zones * num_groups;
-  int num_zg_i = local_jmax * local_kmax * num_groups;
-  int num_zg_j = local_imax * local_kmax * num_groups;
-  int num_zg_k = local_imax * local_jmax * num_groups;
-
-  // All directions have same id,jd,kd, since these are all one Direction Set
-  // So pull that information out now
-  Grid_Sweep_Block const &extent = sdom->sweep_block;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-  for (int d = 0; d < num_directions; ++d) {
-    double xcos = 2.0 * direction[d].xcos;
-    double ycos = 2.0 * direction[d].ycos;
-    double zcos = 2.0 * direction[d].zcos;
-
-    double       * KRESTRICT psi_d  = psi  + d*num_zg;
-    double const * KRESTRICT rhs_d  = rhs  + d*num_zg;
-
-    double       * KRESTRICT psi_lf_d = psi_lf + d*num_zg_i;
-    double       * KRESTRICT psi_fr_d = psi_fr + d*num_zg_j;
-    double       * KRESTRICT psi_bo_d = psi_bo + d*num_zg_k;
-
-    //  Perform transport sweep of the grid 1 cell at a time.
-    for (int k = extent.start_k; k != extent.end_k; k += extent.inc_k) {
-      double zcos_dzk = zcos / dz[k + 1];
-      
-      for (int j = extent.start_j; j != extent.end_j; j += extent.inc_j) {
-        double ycos_dyj = ycos / dy[j + 1];
-        
-        for (int i = extent.start_i; i != extent.end_i; i += extent.inc_i) {
-          double xcos_dxi = xcos / dx[i + 1];
-
-          int z = Zonal_INDEX(i, j, k);
-          double const * KRESTRICT sigt_z = sigt + z*num_groups;
-          double       * KRESTRICT psi_d_z = psi_d + z*num_groups;
-          double const * KRESTRICT rhs_d_z = rhs_d + z*num_groups;
-
-          double * KRESTRICT psi_lf_d_z = psi_lf_d + I_PLANE_INDEX(j, k)*num_groups;
-          double * KRESTRICT psi_fr_d_z = psi_fr_d + J_PLANE_INDEX(i, k)*num_groups;
-          double * KRESTRICT psi_bo_d_z = psi_bo_d + K_PLANE_INDEX(i, j)*num_groups;
-
-          for (int g = 0; g < num_groups; ++g) {
-            // Calculate new zonal flux 
-            double psi_d_z_g = (rhs_d_z[g]
-                + psi_lf_d_z[g] * xcos_dxi
-                + psi_fr_d_z[g] * ycos_dyj
-                + psi_bo_d_z[g] * zcos_dzk)
-                / (xcos_dxi + ycos_dyj + zcos_dzk + sigt_z[g]);
-
-            psi_d_z[g] = psi_d_z_g;
-
-            // Apply diamond-difference relationships 
-            psi_lf_d_z[g] = 2.0 * psi_d_z_g - psi_lf_d_z[g];
-            psi_fr_d_z[g] = 2.0 * psi_d_z_g - psi_fr_d_z[g];
-            psi_bo_d_z[g] = 2.0 * psi_d_z_g - psi_bo_d_z[g];
-          }
-        }
-      }
-    }
-  }
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DZG.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DZG.h
deleted file mode 100644
index 4444e5ee6..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_DZG.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_DZG_H__
-#define KRIPKE_KERNEL_3D_DZG_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_DZG : public Kernel {
-  public:
-    virtual ~Kernel_3d_DZG();
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-
-    virtual void LTimes(Grid_Data *grid_data);
-    virtual void LPlusTimes(Grid_Data *grid_data);
-    virtual void scattering(Grid_Data *grid_data);
-    virtual void source(Grid_Data *grid_data);
-    virtual void sweep(Subdomain *ga_set);
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GDZ.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GDZ.cpp
deleted file mode 100644
index 1b65f16bd..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GDZ.cpp
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_GDZ.h>
-#include<Kripke/Grid.h>
-#include<Kripke/SubTVec.h>
-
-Kernel_3d_GDZ::~Kernel_3d_GDZ() {}
-
-Nesting_Order Kernel_3d_GDZ::nestingPsi(void) const {
-  return NEST_GDZ;
-}
-
-Nesting_Order Kernel_3d_GDZ::nestingPhi(void) const {
-  return NEST_GDZ;
-}
-
-Nesting_Order Kernel_3d_GDZ::nestingSigt(void) const {
-  return NEST_DGZ;
-}
-
-Nesting_Order Kernel_3d_GDZ::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_GDZ::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_GDZ::nestingSigs(void) const {
-  return NEST_GDZ;
-}
-
-
-void Kernel_3d_GDZ::LTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Clear phi
-  for(int ds = 0;ds < grid_data->num_zone_sets;++ ds){
-    grid_data->phi[ds]->clear(0.0);
-  }
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_zones = sdom.num_zones;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_dz = num_zones*num_local_directions;
-    int num_nmz = num_zones*num_moments;
-
-    // Get pointers
-    double const * KRESTRICT ell = sdom.ell->ptr();
-    double const * KRESTRICT psi = sdom.psi->ptr();
-    double       * KRESTRICT phi = sdom.phi->ptr();
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for (int g = 0; g < num_local_groups; ++g) {
-      double const * KRESTRICT psi_g = psi + g*num_dz;
-      double       * KRESTRICT phi_g = phi + (group0+g)*num_nmz;
-
-      for(int nm = 0;nm < num_moments;++nm){
-        double const * KRESTRICT ell_nm = ell + nm*num_local_directions;
-        double       * KRESTRICT phi_g_nm = phi_g + nm*num_zones;
-
-        for (int d = 0; d < num_local_directions; d++) {
-          double const * KRESTRICT psi_g_d = psi_g + d*num_zones;
-          double const             ell_nm_d = ell_nm[d];
-
-          for(int z = 0;z < num_zones; ++ z){
-            phi_g_nm[z] += ell_nm_d * psi_g_d[z];
-          }
-        }
-      }
-    }
-  }
-}
-
-void Kernel_3d_GDZ::LPlusTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_zones = sdom.num_zones;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_nmz = num_moments*num_zones;
-    int num_dz = num_local_directions*num_zones;
-
-    // Zero RHS
-    sdom.rhs->clear(0.0);
-    
-    // Get pointers
-    double const * KRESTRICT phi_out = sdom.phi_out->ptr();
-    double const * KRESTRICT ell_plus = sdom.ell_plus->ptr();
-    double       * KRESTRICT rhs = sdom.rhs->ptr();
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for (int g = 0; g < num_local_groups; ++g) {
-      double const * KRESTRICT phi_out_g = phi_out + (group0+g)*num_nmz;
-      double       * KRESTRICT rhs_g = rhs + g*num_dz;
-
-      for (int d = 0; d < num_local_directions; d++) {
-        double const * KRESTRICT ell_plus_d = ell_plus + d*num_moments;
-        double       * KRESTRICT rhs_g_d = rhs_g + d*num_zones;
-
-        for(int nm = 0;nm < num_moments;++nm){
-          double const * KRESTRICT phi_out_g_nm = phi_out_g + nm*num_zones;
-          double const             ell_plus_d_nm = ell_plus_d[nm];
-
-          for(int z = 0;z < num_zones; ++ z){
-            rhs_g_d[z] += ell_plus_d_nm * phi_out_g_nm[z];
-          }          
-        }        
-      }     
-    }
-  } 
-}
-
-/**
-  Compute scattering source term phi_out from flux moments in phi.
-  phi_out(gp,z,nm) = sum_g { sigs(g, n, gp) * phi(g,z,nm) }
-*/
-void Kernel_3d_GDZ::scattering(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT zones_to_mixed = &sdom.zones_to_mixed[0];
-    int    const * KRESTRICT num_mixed = &sdom.num_mixed[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double const * KRESTRICT sigs = grid_data->sigs->ptr(); 
-
-    int    const * KRESTRICT moment_to_coeff = &grid_data->moment_to_coeff[0];
-    double const * KRESTRICT phi = grid_data->phi[zs]->ptr();
-    double       * KRESTRICT phi_out = grid_data->phi_out[zs]->ptr();
-
-    // Zero out source terms
-    grid_data->phi_out[zs]->clear(0.0);
-
-    // grab dimensions
-    int num_zones = sdom.num_zones;
-    int num_groups = grid_data->phi_out[zs]->groups;
-    int num_moments = grid_data->total_num_moments;
-    int num_coeff = grid_data->legendre_order+1;
-    int num_nmz = num_moments*num_zones;
-
-    for(int g = 0;g < num_groups;++ g){      
-      double const * KRESTRICT sigs_g = sigs + g*num_groups*num_coeff*3;
-      double const * KRESTRICT phi_g = phi + g*num_nmz;
-                    
-      for(int gp = 0;gp < num_groups;++ gp){           
-        double const * KRESTRICT sigs_g_gp = sigs_g + gp*num_coeff*3;
-        double       * KRESTRICT phi_out_gp = phi_out + gp*num_nmz;
-
-        for(int nm = 0;nm < num_moments;++ nm){
-          // map nm to n
-          int n = moment_to_coeff[nm];
-
-          double const * KRESTRICT sigs_g_gp_n = sigs_g_gp + n*3;
-          double const * KRESTRICT phi_g_nm = phi_g + nm*num_zones;
-          double       * KRESTRICT phi_out_gp_nm = phi_out_gp + nm*num_zones;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-          for(int zone = 0;zone < num_zones;++ zone){
-            int mix_start = zones_to_mixed[zone];
-            int mix_stop = mix_start + num_mixed[zone];
-
-            for(int mix = mix_start;mix < mix_stop;++ mix){
-              int material = mixed_material[mix];
-              double fraction = mixed_fraction[mix];                
-                                                                          
-              phi_out_gp_nm[zone] += sigs_g_gp_n[material] * phi_g_nm[zone] * fraction;
-            }
-          }
-        }        
-      }
-    }
-  }
-}
-
-
-/**
- * Add an isotropic source, with flux of 1, to every zone with Region 1
- * (or material 0).
- *
- * Since it's isotropic, we're just adding this to nm=0.
- */
-void Kernel_3d_GDZ::source(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-  
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT mixed_to_zones = &sdom.mixed_to_zones[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double       * KRESTRICT phi_out = grid_data->phi_out[zs]->ptr();
-
-    // grab dimensions
-    int num_mixed = sdom.mixed_to_zones.size();
-    int num_zones = sdom.num_zones;
-    int num_groups = grid_data->phi_out[zs]->groups;
-    int num_moments = grid_data->total_num_moments;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int g = 0;g < num_groups;++ g){
-      double * KRESTRICT phi_out_g_nm0 = phi_out + g*num_zones*num_moments;
-      
-      for(int mix = 0;mix < num_mixed;++ mix){
-        int zone = mixed_to_zones[mix];
-        int material = mixed_material[mix];
-        double fraction = mixed_fraction[mix];
-
-        if(material == 0){
-          phi_out_g_nm0[zone] += 1.0 * fraction;
-        }
-      }
-    }
-  }
-}
-
-// Macros for offsets with fluxes on cell faces 
-#define I_PLANE_INDEX(j, k) ((k)*(local_jmax) + (j))
-#define J_PLANE_INDEX(i, k) ((k)*(local_imax) + (i))
-#define K_PLANE_INDEX(i, j) ((j)*(local_imax) + (i))
-#define Zonal_INDEX(i, j, k) ((i) + (local_imax)*(j) \
-  + (local_imax)*(local_jmax)*(k))
-  
-void Kernel_3d_GDZ::sweep(Subdomain *sdom) {
-  int num_directions = sdom->num_directions;
-  int num_groups = sdom->num_groups;
-  int num_zones = sdom->num_zones;
-
-  Directions *direction = sdom->directions;
-
-  int local_imax = sdom->nzones[0];
-  int local_jmax = sdom->nzones[1];
-  int local_kmax = sdom->nzones[2];
-
-  double const * KRESTRICT dx = &sdom->deltas[0][0];
-  double const * KRESTRICT dy = &sdom->deltas[1][0];
-  double const * KRESTRICT dz = &sdom->deltas[2][0];
-  
-  double const * KRESTRICT sigt = sdom->sigt->ptr();
-  double       * KRESTRICT psi  = sdom->psi->ptr();
-  double const * KRESTRICT rhs  = sdom->rhs->ptr();
-
-  double * KRESTRICT psi_lf = sdom->plane_data[0]->ptr();
-  double * KRESTRICT psi_fr = sdom->plane_data[1]->ptr();
-  double * KRESTRICT psi_bo = sdom->plane_data[2]->ptr();
-  
-  int num_dz = num_zones * num_directions;
-  int num_dz_i = local_jmax * local_kmax * num_directions;
-  int num_dz_j = local_imax * local_kmax * num_directions;
-  int num_dz_k = local_imax * local_jmax * num_directions;
-  int num_z_i = local_jmax * local_kmax;
-  int num_z_j = local_imax * local_kmax;
-  int num_z_k = local_imax * local_jmax;
-
-  // All directions have same id,jd,kd, since these are all one Direction Set
-  // So pull that information out now
-  Grid_Sweep_Block const &extent = sdom->sweep_block;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-  for (int g = 0; g < num_groups; ++g) {
-  
-    double const * KRESTRICT sigt_g = sigt + num_zones*g;
-    double       * KRESTRICT psi_g  = psi  + g*num_dz;
-    double const * KRESTRICT rhs_g  = rhs  + g*num_dz;
-
-    double       * KRESTRICT psi_lf_g = psi_lf + g*num_dz_i;
-    double       * KRESTRICT psi_fr_g = psi_fr + g*num_dz_j;
-    double       * KRESTRICT psi_bo_g = psi_bo + g*num_dz_k;
-
-    for (int d = 0; d < num_directions; ++d) {
-      double       * KRESTRICT psi_g_d = psi_g + d*num_zones;
-      double const * KRESTRICT rhs_g_d = rhs_g + d*num_zones;
-      double       * KRESTRICT psi_lf_g_d = psi_lf_g + d*num_z_i;
-      double       * KRESTRICT psi_fr_g_d = psi_fr_g + d*num_z_j;
-      double       * KRESTRICT psi_bo_g_d = psi_bo_g + d*num_z_k;
-
-      double xcos = 2.0 * direction[d].xcos;
-      double ycos = 2.0 * direction[d].ycos;
-      double zcos = 2.0 * direction[d].zcos;
-
-      //  Perform transport sweep of the grid 1 cell at a time.
-      for (int k = extent.start_k; k != extent.end_k; k += extent.inc_k) {
-        double zcos_dzk = zcos / dz[k + 1];
-        
-        for (int j = extent.start_j; j != extent.end_j; j += extent.inc_j) {
-          double ycos_dyj = ycos / dy[j + 1];
-                    
-          for (int i = extent.start_i; i != extent.end_i; i += extent.inc_i) {
-            double xcos_dxi = xcos / dx[i + 1];
-            
-            int z_idx = Zonal_INDEX(i, j, k);
-            int z_i = I_PLANE_INDEX(j, k);
-            int z_j = J_PLANE_INDEX(i, k);
-            int z_k = K_PLANE_INDEX(i, j);
-
-            // Calculate new zonal flux
-            double psi_g_d_z = (rhs_g_d[z_idx]
-                + psi_lf_g_d[z_i] * xcos_dxi
-                + psi_fr_g_d[z_j] * ycos_dyj
-                + psi_bo_g_d[z_k] * zcos_dzk)
-                / (xcos_dxi + ycos_dyj + zcos_dzk + sigt_g[z_idx]);
-                
-            psi_g_d[z_idx] = psi_g_d_z;
-
-            // Apply diamond-difference relationships
-            psi_lf_g_d[z_i] = 2.0 * psi_g_d_z - psi_lf_g_d[z_i];
-            psi_fr_g_d[z_j] = 2.0 * psi_g_d_z - psi_fr_g_d[z_j];
-            psi_bo_g_d[z_k] = 2.0 * psi_g_d_z - psi_bo_g_d[z_k];
-          }
-        }
-      }
-    }
-  }
-
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GDZ.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GDZ.h
deleted file mode 100644
index 2dd8de0ec..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GDZ.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_GDZ_H__
-#define KRIPKE_KERNEL_3D_GDZ_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_GDZ : public Kernel {
-  public:
-    virtual ~Kernel_3d_GDZ();
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-
-    virtual void LTimes(Grid_Data *grid_data);
-    virtual void LPlusTimes(Grid_Data *grid_data);
-    virtual void scattering(Grid_Data *grid_data);
-    virtual void source(Grid_Data *grid_data);
-    virtual void sweep(Subdomain *ga_set);
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GZD.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GZD.cpp
deleted file mode 100644
index 6f3208394..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GZD.cpp
+++ /dev/null
@@ -1,369 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_GZD.h>
-#include<Kripke/Grid.h>
-#include<Kripke/SubTVec.h>
-
-Kernel_3d_GZD::~Kernel_3d_GZD() {}
-
-Nesting_Order Kernel_3d_GZD::nestingPsi(void) const {
-  return NEST_GZD;
-}
-
-Nesting_Order Kernel_3d_GZD::nestingPhi(void) const {
-  return NEST_GZD;
-}
-
-Nesting_Order Kernel_3d_GZD::nestingSigt(void) const {
-  return NEST_DGZ;
-}
-
-Nesting_Order Kernel_3d_GZD::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_GZD::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_GZD::nestingSigs(void) const {
-  return NEST_GZD;
-}
-
-
-void Kernel_3d_GZD::LTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Clear phi
-  for(int ds = 0;ds < grid_data->num_zone_sets;++ ds){
-    grid_data->phi[ds]->clear(0.0);
-  }
-
- // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_zones = sdom.num_zones;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_dz = num_zones*num_local_directions;
-    int num_nmz = num_zones*num_moments;
-
-    // Get pointers
-    double const * KRESTRICT ell = sdom.ell->ptr();
-    double const * KRESTRICT psi = sdom.psi->ptr();
-    double       * KRESTRICT phi = sdom.phi->ptr();
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for (int g = 0; g < num_local_groups; ++g) {
-      double const * KRESTRICT psi_g = psi + g*num_dz;
-      double       * KRESTRICT phi_g = phi + (group0+g)*num_nmz;
-
-      for(int z = 0;z < num_zones; ++ z){
-        double const * KRESTRICT psi_g_z = psi_g + z*num_local_directions;
-        double       * KRESTRICT phi_g_z = phi_g + z*num_moments;
-
-        for(int nm = 0;nm < num_moments;++nm){
-          double const * KRESTRICT ell_nm = ell + nm*num_local_directions;
-
-          double phi_g_z_nm = 0.0;
-          for (int d = 0; d < num_local_directions; d++) {
-            phi_g_z_nm += ell_nm[d] * psi_g_z[d];
-          }
-          phi_g_z[nm] += phi_g_z_nm;
-        }
-      }
-    }
-  }
-}
-
-void Kernel_3d_GZD::LPlusTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_zones = sdom.num_zones;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_nmz = num_moments*num_zones;
-    int num_dz = num_local_directions*num_zones;
-
-    // Zero RHS
-    sdom.rhs->clear(0.0);
-    
-    // Get pointers
-    double const * KRESTRICT phi_out = sdom.phi_out->ptr();
-    double const * KRESTRICT ell_plus = sdom.ell_plus->ptr();
-    double       * KRESTRICT rhs = sdom.rhs->ptr();
-
-    for (int g = 0; g < num_local_groups; ++g) {
-      double const * KRESTRICT phi_out_g = phi_out + (group0+g)*num_nmz;
-      double       * KRESTRICT rhs_g = rhs + g*num_dz;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-      for(int z = 0;z < num_zones; ++ z){
-        double const * KRESTRICT phi_out_g_z = phi_out_g + z*num_moments;
-        double       * KRESTRICT rhs_g_z = rhs_g + z*num_local_directions;
-
-        for (int d = 0; d < num_local_directions; d++) {
-          double const * KRESTRICT ell_plus_d = ell_plus + d*num_moments;
-
-          double rhs_g_z_d = 0.0;
-          for(int nm = 0;nm < num_moments;++nm){            
-            rhs_g_z_d += ell_plus_d[nm] * phi_out_g_z[nm];
-          }          
-          rhs_g_z[d] += rhs_g_z_d;
-        }        
-      }     
-    }
-  } 
-}
-
-
-/**
-  Compute scattering source term phi_out from flux moments in phi.
-  phi_out(gp,z,nm) = sum_g { sigs(g, n, gp) * phi(g,z,nm) }
-*/
-void Kernel_3d_GZD::scattering(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT zones_to_mixed = &sdom.zones_to_mixed[0];
-    int    const * KRESTRICT num_mixed = &sdom.num_mixed[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double const * KRESTRICT sigs = grid_data->sigs->ptr();
-
-    int    const * KRESTRICT moment_to_coeff = &grid_data->moment_to_coeff[0];
-    double const * KRESTRICT phi = grid_data->phi[zs]->ptr();
-    double       * KRESTRICT phi_out = grid_data->phi_out[zs]->ptr();
-
-    // Zero out source terms
-    grid_data->phi_out[zs]->clear(0.0);
-
-    // grab dimensions
-    int num_zones = sdom.num_zones;
-    int num_groups = grid_data->phi_out[zs]->groups;
-    int num_moments = grid_data->total_num_moments;
-    int num_coeff = grid_data->legendre_order+1;
-    int num_nmz = num_moments*num_zones;
-
-    for(int g = 0;g < num_groups;++ g){      
-      double const * KRESTRICT sigs_g = sigs + g*num_groups*num_coeff*3;
-      double const * KRESTRICT phi_g = phi + g*num_nmz;
-                    
-      for(int gp = 0;gp < num_groups;++ gp){           
-        double const * KRESTRICT sigs_g_gp = sigs_g + gp*num_coeff*3;
-        double       * KRESTRICT phi_out_gp = phi_out + gp*num_nmz;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-        for(int zone = 0;zone < num_zones;++ zone){
-          int mix_start = zones_to_mixed[zone];
-          int mix_stop = mix_start + num_mixed[zone];
-
-          for(int mix = mix_start;mix < mix_stop;++ mix){
-            int material = mixed_material[mix];
-            double fraction = mixed_fraction[mix];
-            
-            double const * KRESTRICT sigs_g_gp_mat = sigs_g_gp + material*num_coeff;
-            double const * KRESTRICT phi_g_z = phi_g + zone*num_moments;
-            double       * KRESTRICT phi_out_gp_z = phi_out_gp + zone*num_moments;
-      
-            for(int nm = 0;nm < num_moments;++ nm){
-              // map nm to n
-              int n = moment_to_coeff[nm];
-                                                                          
-              phi_out_gp_z[nm] += sigs_g_gp_mat[n] * phi_g_z[nm] * fraction;
-            }
-          }        
-        }
-      }
-    }
-  }
-}
-
-
-/**
- * Add an isotropic source, with flux of 1, to every zone with Region 1
- * (or material 0).
- *
- * Since it's isotropic, we're just adding this to nm=0.
- */
-void Kernel_3d_GZD::source(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-  
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT mixed_to_zones = &sdom.mixed_to_zones[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double       * KRESTRICT phi_out = grid_data->phi_out[zs]->ptr();
-
-    // grab dimensions
-    int num_mixed = sdom.mixed_to_zones.size();
-    int num_zones = sdom.num_zones;
-    int num_groups = grid_data->phi_out[zs]->groups;
-    int num_moments = grid_data->total_num_moments;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int g = 0;g < num_groups;++ g){
-      double * KRESTRICT phi_out_g_nm0 = phi_out + g*num_zones*num_moments;
-      
-      for(int mix = 0;mix < num_mixed;++ mix){
-        int zone = mixed_to_zones[mix];
-        int material = mixed_material[mix];
-        double fraction = mixed_fraction[mix];
-
-        if(material == 0){
-          phi_out_g_nm0[zone*num_moments] += 1.0 * fraction;
-        }
-      }
-    }
-  }
-}
-
-// Macros for offsets with fluxes on cell faces 
-#define I_PLANE_INDEX(j, k) ((k)*(local_jmax) + (j))
-#define J_PLANE_INDEX(i, k) ((k)*(local_imax) + (i))
-#define K_PLANE_INDEX(i, j) ((j)*(local_imax) + (i))
-#define Zonal_INDEX(i, j, k) ((i) + (local_imax)*(j) \
-  + (local_imax)*(local_jmax)*(k))
-  
-void Kernel_3d_GZD::sweep(Subdomain *sdom) {
-  int num_directions = sdom->num_directions;
-  int num_groups = sdom->num_groups;
-  int num_zones = sdom->num_zones;
-
-  Directions *direction = sdom->directions;
-
-  int local_imax = sdom->nzones[0];
-  int local_jmax = sdom->nzones[1];
-  int local_kmax = sdom->nzones[2];
-
-  double const * KRESTRICT dx = &sdom->deltas[0][0];
-  double const * KRESTRICT dy = &sdom->deltas[1][0];
-  double const * KRESTRICT dz = &sdom->deltas[2][0];
-  
-  double const * KRESTRICT sigt = sdom->sigt->ptr();
-  double       * KRESTRICT psi  = sdom->psi->ptr();
-  double const * KRESTRICT rhs  = sdom->rhs->ptr();
-
-  double * KRESTRICT psi_lf = sdom->plane_data[0]->ptr();
-  double * KRESTRICT psi_fr = sdom->plane_data[1]->ptr();
-  double * KRESTRICT psi_bo = sdom->plane_data[2]->ptr();
-  
-  int num_zd = num_zones * num_directions;
-  int num_zd_i = local_jmax * local_kmax * num_directions;
-  int num_zd_j = local_imax * local_kmax * num_directions;
-  int num_zd_k = local_imax * local_jmax * num_directions;
-  
-  // All directions have same id,jd,kd, since these are all one Direction Set
-  // So pull that information out now
-  Grid_Sweep_Block const &extent = sdom->sweep_block;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-  for (int g = 0; g < num_groups; ++g) {
-    double const * KRESTRICT sigt_g = sigt + num_zones*g;
-    double       * KRESTRICT psi_g  = psi  + g*num_zd;
-    double const * KRESTRICT rhs_g  = rhs  + g*num_zd;
-
-    double       * KRESTRICT psi_lf_g = psi_lf + g*num_zd_i;
-    double       * KRESTRICT psi_fr_g = psi_fr + g*num_zd_j;
-    double       * KRESTRICT psi_bo_g = psi_bo + g*num_zd_k;
-
-    //  Perform transport sweep of the grid 1 cell at a time.   
-    for (int k = extent.start_k; k != extent.end_k; k += extent.inc_k) {
-      double two_dz = 2.0 / dz[k + 1];
-      for (int j = extent.start_j; j != extent.end_j; j += extent.inc_j) {
-        double two_dy = 2.0 / dy[j + 1];
-        for (int i = extent.start_i; i != extent.end_i; i += extent.inc_i) {
-          double two_dx = 2.0 / dx[i + 1];
-
-          int z = Zonal_INDEX(i, j, k);
-
-          double const sigt_g_z = sigt_g[z];
-          double       * KRESTRICT psi_g_z = psi_g + z*num_directions; 
-          double const * KRESTRICT rhs_g_z = rhs_g + z*num_directions;
-
-          double * KRESTRICT psi_lf_g_z = psi_lf_g + I_PLANE_INDEX(j, k)*num_directions;
-          double * KRESTRICT psi_fr_g_z = psi_fr_g + J_PLANE_INDEX(i, k)*num_directions;
-          double * KRESTRICT psi_bo_g_z = psi_bo_g + K_PLANE_INDEX(i, j)*num_directions;
-
-          for (int d = 0; d < num_directions; ++d) {            
-            double xcos_dxi = direction[d].xcos * two_dx;
-            double ycos_dyj = direction[d].ycos * two_dy;
-            double zcos_dzk = direction[d].zcos * two_dz;
-                       
-            // Calculate new zonal flux 
-            double psi_g_z_d = (rhs_g_z[d] + psi_lf_g_z[d] * xcos_dxi
-                + psi_fr_g_z[d] * ycos_dyj + psi_bo_g_z[d] * zcos_dzk)
-                / (xcos_dxi + ycos_dyj + zcos_dzk + sigt_g_z);
-
-            psi_g_z[d] = psi_g_z_d;
-
-            // Apply diamond-difference relationships 
-            psi_lf_g_z[d] = 2.0 * psi_g_z_d - psi_lf_g_z[d];
-            psi_fr_g_z[d] = 2.0 * psi_g_z_d - psi_fr_g_z[d];
-            psi_bo_g_z[d] = 2.0 * psi_g_z_d - psi_bo_g_z[d];
-          }
-        }
-      }
-    }
-  } 
-}
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GZD.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GZD.h
deleted file mode 100644
index 242f3fa43..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_GZD.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_GZD_H__
-#define KRIPKE_KERNEL_3D_GZD_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_GZD : public Kernel {
-  public:
-    virtual ~Kernel_3d_GZD();
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-
-    virtual void LTimes(Grid_Data *grid_data);
-    virtual void LPlusTimes(Grid_Data *grid_data);
-    virtual void scattering(Grid_Data *grid_data);
-    virtual void source(Grid_Data *grid_data);
-    virtual void sweep(Subdomain *ga_set);
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZDG.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZDG.cpp
deleted file mode 100644
index 023b5e54c..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZDG.cpp
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_ZDG.h>
-#include<Kripke/Grid.h>
-#include<Kripke/SubTVec.h>
-
-Kernel_3d_ZDG::~Kernel_3d_ZDG() {}
-
-Nesting_Order Kernel_3d_ZDG::nestingPsi(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_ZDG::nestingPhi(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_ZDG::nestingSigt(void) const {
-  return NEST_DZG;
-}
-
-Nesting_Order Kernel_3d_ZDG::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_ZDG::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_ZDG::nestingSigs(void) const {
-  return NEST_ZDG;
-}
-
-
-void Kernel_3d_ZDG::LTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Clear phi
-  for(int ds = 0;ds < grid_data->num_zone_sets;++ ds){
-    grid_data->phi[ds]->clear(0.0);
-  }
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_groups = sdom.phi->groups;
-    int num_zones = sdom.num_zones;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_gnm = num_groups * num_moments;
-    int num_locgd = num_local_groups * num_local_directions;
-
-    // Get pointers
-    double const * KRESTRICT ell = sdom.ell->ptr();
-    double const * KRESTRICT psi = sdom.psi->ptr();
-    double       * KRESTRICT phi = sdom.phi->ptr();
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for (int z = 0; z < num_zones; z++) {
-      double const * KRESTRICT psi_z = psi + z*num_locgd;
-      double       * KRESTRICT phi_z = phi + z*num_gnm;
-
-      for(int nm = 0;nm < num_moments;++nm){
-        double const * KRESTRICT ell_nm = ell + nm*num_local_directions;
-        double       * KRESTRICT phi_z_nm_g0 = phi_z + nm*num_groups + group0;
-
-        for (int d = 0; d < num_local_directions; d++) {
-          double const             ell_nm_d = ell_nm[d];
-          double const * KRESTRICT psi_z_d = psi_z + d*num_local_groups;
-
-          for (int g = 0; g < num_local_groups; ++g) {
-            phi_z_nm_g0[g] += ell_nm_d * psi_z_d[g];
-          }
-        }
-      }
-    }
-  }
-}
-
-void Kernel_3d_ZDG::LPlusTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_zones = sdom.num_zones;
-    int num_groups = sdom.phi->groups;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_gnm = num_moments*num_groups;
-    int num_locgd = num_local_directions*num_local_groups;
-
-    // Zero RHS
-    sdom.rhs->clear(0.0);
-    
-    // Get pointers
-    double const * KRESTRICT phi_out = sdom.phi_out->ptr();
-    double const * KRESTRICT ell_plus = sdom.ell_plus->ptr();
-    double       * KRESTRICT rhs = sdom.rhs->ptr();
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int z = 0;z < num_zones; ++ z){    
-      double const * KRESTRICT phi_out_z = phi_out + z*num_gnm;
-      double       * KRESTRICT rhs_z = rhs + z*num_locgd;
-      
-      for (int d = 0; d < num_local_directions; d++) {
-        double const * KRESTRICT ell_plus_d = ell_plus + d*num_moments;
-        double       * KRESTRICT rhs_z_d = rhs_z + d*num_local_groups;
-        
-        for(int nm = 0;nm < num_moments;++nm){
-          double const * KRESTRICT phi_out_z_nm = phi_out_z + nm*num_groups + group0;
-          double const             ell_plus_d_nm = ell_plus_d[nm];
-          
-          for (int g = 0; g < num_local_groups; ++g) {
-            rhs_z_d[g] += ell_plus_d_nm * phi_out_z_nm[g];
-          }                    
-        }        
-      }     
-    }
-  }
-}
-
-
-/**
-  Compute scattering source term phi_out from flux moments in phi.
-  phi_out(gp,z,nm) = sum_g { sigs(g, n, gp) * phi(g,z,nm) }
-*/
-void Kernel_3d_ZDG::scattering(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT zones_to_mixed = &sdom.zones_to_mixed[0];
-    int    const * KRESTRICT num_mixed = &sdom.num_mixed[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double const * KRESTRICT sigs = grid_data->sigs->ptr();
-
-    int    const * KRESTRICT moment_to_coeff = &grid_data->moment_to_coeff[0];
-    double const * KRESTRICT phi = grid_data->phi[zs]->ptr();
-    double       * KRESTRICT phi_out = grid_data->phi_out[zs]->ptr();
-
-    // Zero out source terms
-    grid_data->phi_out[zs]->clear(0.0);
-
-    // grab dimensions
-    int num_zones = sdom.num_zones;
-    int num_groups = grid_data->phi_out[zs]->groups;
-    int num_moments = grid_data->total_num_moments;
-    int num_coeff = grid_data->legendre_order+1;
-    int num_nmg = num_moments*num_groups;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int zone = 0;zone < num_zones;++ zone){
-      int mix_start = zones_to_mixed[zone];
-      int mix_stop = mix_start + num_mixed[zone];
-
-      for(int mix = mix_start;mix < mix_stop;++ mix){
-        int material = mixed_material[mix];
-        double fraction = mixed_fraction[mix];
-        
-        double const * KRESTRICT sigs_mat = sigs + material*num_coeff*num_groups*num_groups;
-        double const * KRESTRICT phi_z = phi + zone*num_nmg;
-        double       * KRESTRICT phi_out_z = phi_out + zone*num_nmg;
-
-        for(int nm = 0;nm < num_moments;++ nm){
-          // map nm to n
-          int n = moment_to_coeff[nm];
-          
-          double const * KRESTRICT sigs_mat_n = sigs_mat + n*num_groups*num_groups;
-          double const * KRESTRICT phi_z_nm = phi_z + nm*num_groups;
-          double       * KRESTRICT phi_out_z_nm = phi_out_z + nm*num_groups;
-
-          for(int g = 0;g < num_groups;++ g){      
-            double const * KRESTRICT sigs_mat_n_g = sigs_mat_n + g*num_groups;
-            double const             phi_z_nm_g = phi_z_nm[g];
-                              
-            for(int gp = 0;gp < num_groups;++ gp){
-              phi_out_z_nm[gp] += sigs_mat_n_g[gp] * phi_z_nm_g * fraction;
-            }
-          }        
-        }
-      }
-    }
-  }
-}
-
-/**
- * Add an isotropic source, with flux of 1, to every zone with Region 1
- * (or material 0).
- *
- * Since it's isotropic, we're just adding this to nm=0.
- */
-void Kernel_3d_ZDG::source(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-  
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT zones_to_mixed = &sdom.zones_to_mixed[0];
-    int    const * KRESTRICT num_mixed = &sdom.num_mixed[0];
-    int    const * KRESTRICT mixed_to_zones = &sdom.mixed_to_zones[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double       * KRESTRICT phi_out = grid_data->phi_out[zs]->ptr();
-
-    // grab dimensions
-    int num_zones = sdom.num_zones;
-    int num_groups = grid_data->phi_out[zs]->groups;
-    int num_moments = grid_data->total_num_moments;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int zone = 0;zone < num_zones;++ zone){
-      int mix_start = zones_to_mixed[zone];
-      int mix_stop = mix_start + num_mixed[zone];
-
-      for(int mix = mix_start;mix < mix_stop;++ mix){
-        int material = mixed_material[mix];
-        double fraction = mixed_fraction[mix];      
-        double * KRESTRICT phi_out_z_nm0 = phi_out + zone*num_moments*num_groups;
-
-        if(material == 0){        
-          for(int g = 0;g < num_groups;++ g){
-            phi_out_z_nm0[g] += 1.0 * fraction;
-          }
-        }
-      }
-    }
-  }
-}
-
-// Macros for offsets with fluxes on cell faces 
-#define I_PLANE_INDEX(j, k) ((k)*(local_jmax) + (j))
-#define J_PLANE_INDEX(i, k) ((k)*(local_imax) + (i))
-#define K_PLANE_INDEX(i, j) ((j)*(local_imax) + (i))
-#define Zonal_INDEX(i, j, k) ((i) + (local_imax)*(j) \
-  + (local_imax)*(local_jmax)*(k))
-
-void Kernel_3d_ZDG::sweep(Subdomain *sdom) {
-  int num_directions = sdom->num_directions;
-  int num_groups = sdom->num_groups;
-
-  Directions *direction = sdom->directions;
-
-  int local_imax = sdom->nzones[0];
-  int local_jmax = sdom->nzones[1];
-
-  double const * KRESTRICT dx = &sdom->deltas[0][0];
-  double const * KRESTRICT dy = &sdom->deltas[1][0];
-  double const * KRESTRICT dz = &sdom->deltas[2][0];
-  
-  double const * KRESTRICT sigt = sdom->sigt->ptr();
-  double       * KRESTRICT psi  = sdom->psi->ptr();
-  double const * KRESTRICT rhs  = sdom->rhs->ptr();
-
-  double * KRESTRICT psi_lf = sdom->plane_data[0]->ptr();
-  double * KRESTRICT psi_fr = sdom->plane_data[1]->ptr();
-  double * KRESTRICT psi_bo = sdom->plane_data[2]->ptr();
-  
-  int num_gd = num_groups * num_directions;
-
-  // All directions have same id,jd,kd, since these are all one Direction Set
-  // So pull that information out now
-  Grid_Sweep_Block const &extent = sdom->sweep_block;
-
-  for (int k = extent.start_k; k != extent.end_k; k += extent.inc_k) {
-    double two_dz = 2.0 / dz[k + 1];
-    for (int j = extent.start_j; j != extent.end_j; j += extent.inc_j) {
-      double two_dy = 2.0 / dy[j + 1];
-      for (int i = extent.start_i; i != extent.end_i; i += extent.inc_i) {
-        double two_dx = 2.0 / dx[i + 1];
-
-        int z = Zonal_INDEX(i, j, k);
-        
-        double const * KRESTRICT sigt_z = sigt + z*num_groups;
-        double       * KRESTRICT psi_z  = psi  + z*num_gd;
-        double const * KRESTRICT rhs_z  = rhs  + z*num_gd;
-
-        double * KRESTRICT psi_lf_z = psi_lf + I_PLANE_INDEX(j, k) * num_gd;
-        double * KRESTRICT psi_fr_z = psi_fr + J_PLANE_INDEX(i, k) * num_gd;
-        double * KRESTRICT psi_bo_z = psi_bo + K_PLANE_INDEX(i, j) * num_gd;
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-        for (int d = 0; d < num_directions; ++d) {
-        
-          double xcos_dxi = two_dx * direction[d].xcos;
-          double ycos_dyj = two_dy * direction[d].ycos;          
-          double zcos_dzk = two_dz * direction[d].zcos;
-
-          double       * KRESTRICT psi_z_d = psi_z + d*num_groups;
-          double const * KRESTRICT rhs_z_d = rhs_z + d*num_groups;
-
-          double * KRESTRICT psi_lf_z_d = psi_lf_z + d*num_groups;
-          double * KRESTRICT psi_fr_z_d = psi_fr_z + d*num_groups;
-          double * KRESTRICT psi_bo_z_d = psi_bo_z + d*num_groups;
-
-          for (int g = 0; g < num_groups; ++g) {
-            // Calculate new zonal flux 
-            double psi_z_d_g = (rhs_z_d[g]
-                + psi_lf_z_d[g] * xcos_dxi
-                + psi_fr_z_d[g] * ycos_dyj
-                + psi_bo_z_d[g] * zcos_dzk)
-                / (xcos_dxi + ycos_dyj + zcos_dzk + sigt_z[g]);
-
-            psi_z_d[g] = psi_z_d_g;
-
-            // Apply diamond-difference relationships 
-            psi_lf_z_d[g] = 2.0 * psi_z_d_g - psi_lf_z_d[g];
-            psi_fr_z_d[g] = 2.0 * psi_z_d_g - psi_fr_z_d[g];
-            psi_bo_z_d[g] = 2.0 * psi_z_d_g - psi_bo_z_d[g];
-          }
-        }
-      }
-    }
-  }
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZDG.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZDG.h
deleted file mode 100644
index 3f9626b18..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZDG.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_ZDG_H__
-#define KRIPKE_KERNEL_3D_ZDG_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_ZDG : public Kernel {
-  public:
-    virtual ~Kernel_3d_ZDG();
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-
-    virtual void LTimes(Grid_Data *grid_data);
-    virtual void LPlusTimes(Grid_Data *grid_data);
-    virtual void scattering(Grid_Data *grid_data);
-    virtual void source(Grid_Data *grid_data);
-    virtual void sweep(Subdomain *ga_set);
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZGD.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZGD.cpp
deleted file mode 100644
index 5dd9162a9..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZGD.cpp
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Kernel/Kernel_3d_ZGD.h>
-#include<Kripke/Grid.h>
-#include<Kripke/SubTVec.h>
-
-Kernel_3d_ZGD::~Kernel_3d_ZGD() {}
-
-Nesting_Order Kernel_3d_ZGD::nestingPsi(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_ZGD::nestingPhi(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_ZGD::nestingSigt(void) const {
-  return NEST_DZG;
-}
-
-Nesting_Order Kernel_3d_ZGD::nestingEll(void) const {
-  return NEST_ZGD;
-}
-
-Nesting_Order Kernel_3d_ZGD::nestingEllPlus(void) const {
-  return NEST_ZDG;
-}
-
-Nesting_Order Kernel_3d_ZGD::nestingSigs(void) const {
-  return NEST_ZGD;
-}
-
-void Kernel_3d_ZGD::LTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Clear phi
-  for(int ds = 0;ds < grid_data->num_zone_sets;++ ds){
-    grid_data->phi[ds]->clear(0.0);
-  }
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_groups = sdom.phi->groups;
-    int num_zones = sdom.num_zones;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_gnm = num_groups * num_moments;
-    int num_locgd = num_local_groups * num_local_directions;
-
-    // Get pointers
-    double const * KRESTRICT ell = sdom.ell->ptr();
-    double const * KRESTRICT psi = sdom.psi->ptr();
-    double       * KRESTRICT phi = sdom.phi->ptr();
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for (int z = 0; z < num_zones; z++) {
-      double const * KRESTRICT psi_z = psi + z*num_locgd;
-      double       * KRESTRICT phi_z = phi + z*num_gnm;
-
-      for (int g = 0; g < num_local_groups; ++g) {
-        double const * KRESTRICT psi_z_g = psi_z + g*num_local_directions;
-        double       * KRESTRICT phi_z_g = phi_z + (group0+g)*num_moments;
-
-        for(int nm = 0;nm < num_moments;++nm){
-          double const * KRESTRICT ell_nm = ell + nm*num_local_directions;
-
-          double phi_z_g_nm = 0.0;
-          for (int d = 0; d < num_local_directions; d++) {
-            phi_z_g_nm += ell_nm[d] * psi_z_g[d];
-          }
-          phi_z_g[nm] += phi_z_g_nm;
-        }
-      }
-    }
-  }
-}
-
-void Kernel_3d_ZGD::LPlusTimes(Grid_Data *grid_data) {
-  // Outer parameters
-  int num_moments = grid_data->total_num_moments;
-
-  // Loop over Subdomains
-  int num_subdomains = grid_data->subdomains.size();
-  for (int sdom_id = 0; sdom_id < num_subdomains; ++ sdom_id){
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-
-    // Get dimensioning
-    int num_zones = sdom.num_zones;
-    int num_groups = sdom.phi->groups;
-    int num_local_groups = sdom.num_groups;
-    int group0 = sdom.group0;
-    int num_local_directions = sdom.num_directions;
-    int num_gnm = num_moments*num_groups;
-    int num_locgd = num_local_directions*num_local_groups;
-
-    // Zero RHS
-    sdom.rhs->clear(0.0);
-    
-    // Get pointers
-    double const * KRESTRICT phi_out = sdom.phi_out->ptr();
-    double const * KRESTRICT ell_plus = sdom.ell_plus->ptr();
-    double       * KRESTRICT rhs = sdom.rhs->ptr();
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int z = 0;z < num_zones; ++ z){    
-      double const * KRESTRICT phi_out_z = phi_out + z*num_gnm;
-      double       * KRESTRICT rhs_z = rhs + z*num_locgd;
-      
-      for (int g = 0; g < num_local_groups; ++g) {
-        double const * KRESTRICT phi_out_z_g = phi_out_z + (group0+g)*num_moments;
-        double       * KRESTRICT rhs_z_g = rhs_z + g*num_local_directions;
-      
-        for (int d = 0; d < num_local_directions; d++) {
-          double const * KRESTRICT ell_plus_d = ell_plus + d*num_moments;
-                              
-          double rhs_z_g_d = 0.0;
-          for(int nm = 0;nm < num_moments; ++nm){
-            rhs_z_g_d += ell_plus_d[nm] * phi_out_z_g[nm];
-          }                    
-          rhs_z_g[d] = rhs_z_g_d;
-        }        
-      }     
-    }
-  }
-}
-
-
-/**
-  Compute scattering source term phi_out from flux moments in phi.
-  phi_out(gp,z,nm) = sum_g { sigs(g, n, gp) * phi(g,z,nm) }
-*/
-void Kernel_3d_ZGD::scattering(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT zones_to_mixed = &sdom.zones_to_mixed[0];
-    int    const * KRESTRICT num_mixed = &sdom.num_mixed[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double const * KRESTRICT sigs = grid_data->sigs->ptr();
-
-    int    const * KRESTRICT moment_to_coeff = &grid_data->moment_to_coeff[0];
-    double const * KRESTRICT phi = grid_data->phi[zs]->ptr();
-    double       * KRESTRICT phi_out = grid_data->phi_out[zs]->ptr();
-
-    // Zero out source terms
-    grid_data->phi_out[zs]->clear(0.0);
-
-    // grab dimensions
-    int num_zones = sdom.num_zones;
-    int num_groups = grid_data->phi_out[zs]->groups;
-    int num_moments = grid_data->total_num_moments;
-    int num_coeff = grid_data->legendre_order+1;
-    int num_nmg = num_moments*num_groups;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int zone = 0;zone < num_zones;++ zone){
-      int mix_start = zones_to_mixed[zone];
-      int mix_stop = mix_start + num_mixed[zone];
-
-      for(int mix = mix_start;mix < mix_stop;++ mix){
-        int material = mixed_material[mix];
-        double fraction = mixed_fraction[mix];
-        
-        double const * KRESTRICT sigs_mat = sigs + material*num_coeff*num_groups*num_groups;
-        double const * KRESTRICT phi_z = phi + zone*num_nmg;
-        double       * KRESTRICT phi_out_z = phi_out + zone*num_nmg;
-        
-        for(int g = 0;g < num_groups;++ g){      
-          double const * KRESTRICT sigs_mat_g = sigs_mat + g*num_groups*num_coeff;
-          double const * KRESTRICT phi_z_g = phi_z + g*num_moments;
-                            
-          for(int gp = 0;gp < num_groups;++ gp){
-            double const * KRESTRICT sigs_mat_g_gp = sigs_mat_g + gp*num_coeff;
-            double       * KRESTRICT phi_out_z_gp = phi_out_z + gp*num_moments;
-          
-            for(int nm = 0;nm < num_moments;++ nm){
-              // map nm to n
-              int n = moment_to_coeff[nm];
-              
-              phi_out_z_gp[nm] += sigs_mat_g_gp[n] * phi_z_g[nm] * fraction;
-            }
-          }        
-        }
-      }
-    }
-  }
-}
-
-/**
- * Add an isotropic source, with flux of 1, to every zone with Region 1
- * (or material 0).
- *
- * Since it's isotropic, we're just adding this to nm=0.
- */
-void Kernel_3d_ZGD::source(Grid_Data *grid_data){
-  // Loop over zoneset subdomains
-  for(int zs = 0;zs < grid_data->num_zone_sets;++ zs){
-  
-    // get material mix information
-    int sdom_id = grid_data->zs_to_sdomid[zs];
-    Subdomain &sdom = grid_data->subdomains[sdom_id];
-    int    const * KRESTRICT zones_to_mixed = &sdom.zones_to_mixed[0];
-    int    const * KRESTRICT num_mixed = &sdom.num_mixed[0];
-    int    const * KRESTRICT mixed_material = &sdom.mixed_material[0];
-    double const * KRESTRICT mixed_fraction = &sdom.mixed_fraction[0];
-    double       * KRESTRICT phi_out = grid_data->phi_out[zs]->ptr();
-
-    // grab dimensions
-    int num_zones = sdom.num_zones;
-    int num_groups = grid_data->phi_out[zs]->groups;
-    int num_moments = grid_data->total_num_moments;
-
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int zone = 0;zone < num_zones;++ zone){
-      int mix_start = zones_to_mixed[zone];
-      int mix_stop = mix_start + num_mixed[zone];
-
-      for(int mix = mix_start;mix < mix_stop;++ mix){
-        int material = mixed_material[mix];
-        double fraction = mixed_fraction[mix];
-        double * KRESTRICT phi_out_z = phi_out + zone*num_moments*num_groups;
-
-        if(material == 0){        
-          for(int g = 0;g < num_groups;++ g){
-            phi_out_z[g*num_moments] += 1.0 * fraction;
-          }
-        }
-      }
-    }
-  }
-}
-
-// Macros for offsets with fluxes on cell faces 
-#define I_PLANE_INDEX(j, k) ((k)*(local_jmax) + (j))
-#define J_PLANE_INDEX(i, k) ((k)*(local_imax) + (i))
-#define K_PLANE_INDEX(i, j) ((j)*(local_imax) + (i))
-#define Zonal_INDEX(i, j, k) ((i) + (local_imax)*(j) \
-  + (local_imax)*(local_jmax)*(k))
-
-void Kernel_3d_ZGD::sweep(Subdomain *sdom) {
-  int num_directions = sdom->num_directions;
-  int num_groups = sdom->num_groups;
-
-  Directions *direction = sdom->directions;
-
-  int local_imax = sdom->nzones[0];
-  int local_jmax = sdom->nzones[1];
-
-  double const * KRESTRICT dx = &sdom->deltas[0][0];
-  double const * KRESTRICT dy = &sdom->deltas[1][0];
-  double const * KRESTRICT dz = &sdom->deltas[2][0];
-  
-  double const * KRESTRICT sigt = sdom->sigt->ptr();
-  double       * KRESTRICT psi  = sdom->psi->ptr();
-  double const * KRESTRICT rhs  = sdom->rhs->ptr();
-
-  double * KRESTRICT psi_lf = sdom->plane_data[0]->ptr();
-  double * KRESTRICT psi_fr = sdom->plane_data[1]->ptr();
-  double * KRESTRICT psi_bo = sdom->plane_data[2]->ptr();
-
-  int num_gd = num_groups * num_directions;
-
-  // All directions have same id,jd,kd, since these are all one Direction Set
-  // So pull that information out now
-  Grid_Sweep_Block const &extent = sdom->sweep_block;
-
-  //  Perform transport sweep of the grid 1 cell at a time.
-  for (int k = extent.start_k; k != extent.end_k; k += extent.inc_k) {
-    double two_dz = 2.0 / dz[k + 1];
-    for (int j = extent.start_j; j != extent.end_j; j += extent.inc_j) {
-      double two_dy = 2.0 / dy[j + 1];
-      for (int i = extent.start_i; i != extent.end_i; i += extent.inc_i) {
-        double two_dx = 2.0 / dx[i + 1];
-
-        int z = Zonal_INDEX(i, j, k);
-        double const * KRESTRICT sigt_z = sigt + z*num_groups;
-        double       * KRESTRICT psi_z  = psi  + z*num_gd;
-        double const * KRESTRICT rhs_z  = rhs  + z*num_gd;
-
-        double       * KRESTRICT psi_lf_z = psi_lf + I_PLANE_INDEX(j, k) * num_gd;
-        double       * KRESTRICT psi_fr_z = psi_fr + J_PLANE_INDEX(i, k) * num_gd;
-        double       * KRESTRICT psi_bo_z = psi_bo + K_PLANE_INDEX(i, j) * num_gd;
-        
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-        for (int g = 0; g < num_groups; ++g) {
-          double       * KRESTRICT psi_z_g = psi_z + g * num_directions;
-          double const * KRESTRICT rhs_z_g = rhs_z + g * num_directions;
-
-          double       * KRESTRICT psi_lf_z_g = psi_lf_z + g * num_directions;
-          double       * KRESTRICT psi_fr_z_g = psi_fr_z + g * num_directions;
-          double       * KRESTRICT psi_bo_z_g = psi_bo_z + g * num_directions;
-
-          for (int d = 0; d < num_directions; ++d) {
-            double xcos_dxi = direction[d].xcos * two_dx;
-            double ycos_dyj = direction[d].ycos * two_dy;
-            double zcos_dzk = direction[d].zcos * two_dz;
-
-            // Calculate new zonal flux 
-            double psi_z_g_d = (rhs_z_g[d]
-                + psi_lf_z_g[d] * xcos_dxi
-                + psi_fr_z_g[d] * ycos_dyj
-                + psi_bo_z_g[d] * zcos_dzk)
-                / (xcos_dxi + ycos_dyj + zcos_dzk + sigt_z[g]);
-
-            psi_z_g[d] = psi_z_g_d;
-
-            // Apply diamond-difference relationships 
-            psi_lf_z_g[d] = 2.0 * psi_z_g_d - psi_lf_z_g[d];
-            psi_fr_z_g[d] = 2.0 * psi_z_g_d - psi_fr_z_g[d];
-            psi_bo_z_g[d] = 2.0 * psi_z_g_d - psi_bo_z_g[d];
-          }
-        }
-      }
-    }
-  }
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZGD.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZGD.h
deleted file mode 100644
index b49950a86..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Kernel/Kernel_3d_ZGD.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_KERNEL_3D_ZGD_H__
-#define KRIPKE_KERNEL_3D_ZGD_H__
-
-#include<Kripke/Kernel.h>
-
-class Kernel_3d_ZGD : public Kernel {
-  public:
-    virtual ~Kernel_3d_ZGD();
-    virtual Nesting_Order nestingPsi(void) const;
-    virtual Nesting_Order nestingPhi(void) const;
-    virtual Nesting_Order nestingSigt(void) const;
-    virtual Nesting_Order nestingEll(void) const;
-    virtual Nesting_Order nestingEllPlus(void) const;
-    virtual Nesting_Order nestingSigs(void) const;
-
-    virtual void LTimes(Grid_Data *grid_data);
-    virtual void LPlusTimes(Grid_Data *grid_data);
-    virtual void scattering(Grid_Data *grid_data);
-    virtual void source(Grid_Data *grid_data);
-    virtual void sweep(Subdomain *ga_set);
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Layout.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Layout.cpp
deleted file mode 100644
index b0176393b..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Layout.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include<Kripke/Layout.h>
-
-#include<Kripke/Input_Variables.h>
-
-#ifdef KRIPKE_USE_MPI
-#include<mpi.h>
-#endif
-
-namespace {
-  /*
-    The following 2 routines are used to map:
-      1) mpi ranks to/from processors in x,y,z
-      2) zoneset ids to/from zoneset in x,y,z
-  */
-
-  /**
-    Helper routine to take an index, and return a 3-dimensional set of indices,
-    given size of each index dimension.
-  */
-  inline void rankToIndices(int rank, int *indices, int const *sizes){
-    indices[0] = rank / (sizes[1]*sizes[2]);
-    rank = rank % (sizes[1]*sizes[2]);
-    indices[1] = rank / sizes[2];
-    indices[2] = rank % sizes[2];
-  }
-
-  /**
-    Helper routine to take an index, and return a 3-dimensional set of indices,
-    given size of each index dimension.
-  */
-  inline int indicesToRank(int const *indices, int const *sizes){
-    int rank;
-
-    rank =  indices[0]*(sizes[1]*sizes[2]);
-    rank += indices[1]*sizes[2];
-    rank += indices[2];
-
-    return rank;
-  }
-}
-
-Layout::Layout(Input_Variables *input_vars){
-  num_group_sets = input_vars->num_groupsets;
-  num_direction_sets = input_vars->num_dirsets;
-  num_zone_sets = 1;
-  for(int dim = 0;dim < 3;++ dim){
-    num_zone_sets_dim[dim] = input_vars->num_zonesets_dim[dim];
-    num_zone_sets *= input_vars->num_zonesets_dim[dim];
-  }
-
-  // grab total number of zones
-  total_zones[0] = input_vars->nx;
-  total_zones[1] = input_vars->ny;
-  total_zones[2] = input_vars->nz;
-
-  // Grab size of processor grid
-  num_procs[0] = input_vars->npx;
-  num_procs[1] = input_vars->npy;
-  num_procs[2] = input_vars->npz;
-
-  /* Set the requested processor grid size */
-  int R = num_procs[0] * num_procs[1] * num_procs[2];
-
-  /* Check requested size is the same as MPI_COMM_WORLD */
-  int size=1;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-#endif
-  if(R != size){
-    int myid=0;
-#ifdef KRIPKE_USE_MPI
-    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-#endif
-    if(myid == 0){
-      KripkeAbort("ERROR: Incorrect number of MPI tasks. Need %d MPI tasks.", R);
-    }
-  }
-
-  /* Compute the local coordinates in the processor decomposition */
-  int mpi_rank = 0;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-#endif
-  rankToIndices(mpi_rank, our_rank, num_procs);
-}
-Layout::~Layout(){
-
-}
-
-/**
-  Computes the subdomain ID based on a given groupset, directionset, and zoneset.
-*/
-int Layout::setIdToSubdomainId(int gs, int ds, int zs) const{
-  int indices[3] = {gs, ds, zs};
-  int sizes[3] = {num_group_sets, num_direction_sets, num_zone_sets};
-
-  return indicesToRank(indices, sizes);
-}
-
-/**
-  Computes groupset, directionset, and zoneset from a subdomain ID.
-*/
-void Layout::subdomainIdToSetId(int sdom_id, int &gs, int &ds, int &zs) const {
-  int indices[3];
-  int sizes[3] = {num_group_sets, num_direction_sets, num_zone_sets};
-
-  rankToIndices(sdom_id, indices, sizes);
-
-  gs = indices[0];
-  ds = indices[1];
-  zs = indices[2];
-}
-
-/**
-  Computes the zoneset id along a particular dimension.
-*/
-int Layout::subdomainIdToZoneSetDim(int sdom_id, int dim) const{
-  // Compute zoneset
-  int gs, ds, zs;
-  subdomainIdToSetId(sdom_id, gs, ds, zs);
-
-  // Compute zone set
-  int zs_dim[3];
-  rankToIndices(zs, zs_dim, num_zone_sets_dim);
-
-  return zs_dim[dim];
-}
-
-/**
-  Computes the number of zones in this subdomain, along specified dimension.
-*/
-int Layout::getNumZones(int sdom_id, int dim) const{
-
-  // get the zoneset index along the specified dimension
-  int zs_dim = subdomainIdToZoneSetDim(sdom_id, dim);
-
-  int total_subdomains = num_procs[dim] * num_zone_sets_dim[dim];
-  int global_subdomain  = num_zone_sets_dim[dim] * our_rank[dim] + zs_dim;
-
-  // Compute subset of global zone indices
-  int num_zones = total_zones[dim] / total_subdomains;
-  int rem = total_zones[dim] % total_subdomains;
-  if(rem != 0 && global_subdomain < rem){
-    num_zones ++;
-  }
-
-  return num_zones;
-}
-
-
-
-
-
-
-BlockLayout::BlockLayout(Input_Variables *input_vars) :
-  Layout(input_vars)
-{
-
-}
-BlockLayout::~BlockLayout(){
-
-}
-
-Neighbor BlockLayout::getNeighbor(int our_sdom_id, int dim, int dir) const{
-  Neighbor n;
-
-  // get our processor indices, so we can find neighbors
-  int proc[3] = {our_rank[0], our_rank[1], our_rank[2]};
-
-  int gs, ds, zs;
-  subdomainIdToSetId(our_sdom_id, gs, ds, zs);
-
-  // Compute out spatial subdomain indices
-  int zs_dim[3];
-  for(int d = 0;d < 3;++ d){
-    zs_dim[d] = subdomainIdToZoneSetDim(our_sdom_id, d);
-  }
-
-  // Offest along dir,dim to get neighboring indices
-  zs_dim[dim] += dir;
-
-  // Check if the neighbor is remote, and wrap zoneset indices
-  if(zs_dim[dim] >= num_zone_sets_dim[dim]){
-    zs_dim[dim] = 0;
-    proc[dim] += dir;
-  }
-  else if(zs_dim[dim] < 0){
-    zs_dim[dim] = num_zone_sets_dim[dim]-1;
-    proc[dim] += dir;
-  }
-
-  // Compute the mpi rank of the neighbor
-  if(proc[dim] < 0 || proc[dim] >= num_procs[dim]){
-    // we hit a boundary condition
-    n.mpi_rank = -1;
-    n.subdomain_id = -1;
-  }
-  else{
-    // There is a neighbor, so compute its rank
-    n.mpi_rank = indicesToRank(proc, num_procs);
-
-    // Compute neighboring subdomain id
-    zs = indicesToRank(zs_dim, num_zone_sets_dim);
-    n.subdomain_id = setIdToSubdomainId(gs, ds, zs);
-  }
-
-  return n;
-}
-
-/**
-  Compute the spatial extents of a subdomain along a given dimension.
-*/
-std::pair<double, double> BlockLayout::getSpatialExtents(int sdom_id, int dim) const{
-
-  // Start with global problem dimensions
-  std::pair<double, double> ext_global(-60.0, 60.0);
-  if(dim == 1){
-    ext_global.first = -100.0;
-    ext_global.second = 100.0;
-  }
-
-  // Subdivide by number of processors in specified dimension
-  double dx = (ext_global.second - ext_global.first) / (double)num_procs[dim];
-  std::pair<double, double> ext_proc(
-    ext_global.first + dx*(double)our_rank[dim],
-    ext_global.first + dx*(double)(our_rank[dim] + 1)
-  );
-
-  // get the zoneset index along the specified dimension
-  int zs_dim = subdomainIdToZoneSetDim(sdom_id, dim);
-
-  // Subdivide by number of subdomains in specified dimension
-  double sdx = (ext_proc.second - ext_proc.first) / (double)num_zone_sets_dim[dim];
-  std::pair<double, double> ext_sdom(
-    ext_proc.first + sdx*(double)zs_dim,
-    ext_proc.first + sdx*(double)(zs_dim + 1)
-  );
-
-  return ext_sdom;
-}
-
-
-
-ScatterLayout::ScatterLayout(Input_Variables *input_vars) :
-  Layout(input_vars)
-{
-
-}
-ScatterLayout::~ScatterLayout(){
-
-}
-
-Neighbor ScatterLayout::getNeighbor(int our_sdom_id, int dim, int dir) const{
-  Neighbor n;
-
-  // get our processor indices, so we can find neighbors
-  int proc[3] = {our_rank[0], our_rank[1], our_rank[2]};
-
-  int gs, ds, zs;
-  subdomainIdToSetId(our_sdom_id, gs, ds, zs);
-
-  // Compute our spatial subdomain indices
-  int zs_dim[3];
-  for(int d = 0;d < 3;++ d){
-    zs_dim[d] = subdomainIdToZoneSetDim(our_sdom_id, d);
-  }
-
-  // Offest along dir,dim to get neighboring subdomain indices
-  proc[dim] += dir;
-
-  // Check if we wrapped mpi ranks, and should bump zoneset indices
-  if(proc[dim] >= num_procs[dim]){
-    proc[dim] = 0;
-    zs_dim[dim] += dir;
-  }
-  else if(proc[dim] < 0){
-    proc[dim] = num_procs[dim]-1;
-    zs_dim[dim] += dir;
-  }
-
-  // Compute zone set indices, and detect boundary condition
-  if(zs_dim[dim] < 0 || zs_dim[dim] >= num_zone_sets_dim[dim]){
-    // we hit a boundary condition
-    n.mpi_rank = -1;
-    n.subdomain_id = -1;
-
-  }
-  else{
-    // There is a neighbor, so compute its rank
-    n.mpi_rank = indicesToRank(proc, num_procs);
-
-    // Compute neighboring subdomain id
-    zs = indicesToRank(zs_dim, num_zone_sets_dim);
-    n.subdomain_id = setIdToSubdomainId(gs, ds, zs);
-  }
-
-
-  return n;
-}
-
-/**
-  Compute the spatial extents of a subdomain along a given dimension.
-*/
-std::pair<double, double> ScatterLayout::getSpatialExtents(int sdom_id, int dim) const{
-
-  // Start with global problem dimensions
-  std::pair<double, double> ext_global(-60.0, 60.0);
-  if(dim == 1){
-    ext_global.first = -100.0;
-    ext_global.second = 100.0;
-  }
-
-  // get the zoneset index along the specified dimension
-  int zs_dim = subdomainIdToZoneSetDim(sdom_id, dim);
-
-  // Subdivide by number of subdomains in specified dimension
-  double sdx = (ext_global.second - ext_global.first) / (double)num_zone_sets_dim[dim];
-  std::pair<double, double> ext_sdom(
-    ext_global.first + sdx*(double)zs_dim,
-    ext_global.first + sdx*(double)(zs_dim + 1)
-  );
-
-  // Subdivide by number of processors in specified dimension
-  double dx = (ext_sdom.second - ext_sdom.first) / (double)num_procs[dim];
-  std::pair<double, double> ext_proc(
-    ext_sdom.first + dx*(double)our_rank[dim],
-    ext_sdom.first + dx*(double)(our_rank[dim] + 1)
-  );
-
-
-  return ext_proc;
-}
-
-
-/**
-  Factory to create Layout object based on user defined inputs
-*/
-Layout *createLayout(Input_Variables *input_vars){
-  switch(input_vars->layout_pattern){
-    case 0:
-      return new BlockLayout(input_vars);
-    case 1:
-      return new ScatterLayout(input_vars);
-  }
-  KripkeAbort("Unknown Layout patter\n");
-  return NULL;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Layout.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Layout.h
deleted file mode 100644
index 1794c6970..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Layout.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_LAYOUT_H__
-#define KRIPKE_LAYOUT_H__
-
-#include<algorithm>
-
-// foreward decl
-struct Input_Variables;
-
-/**
-  Describes a neighboring Subdomain using both mpi-rank and subdomin id
-*/
-struct Neighbor{
-  int mpi_rank;     // Neighbors MPI rank, or -1 for boundary condition
-  int subdomain_id; // Subdomain ID of neighbor
-};
-
-
-
-/**
-   Describes relationships between MPI-ranks and subdomains.
-   This is an interface, allowing different layout schemes to be implemented as derived types.
- */
-class Layout {
-  public:
-    explicit Layout(Input_Variables *input_vars);
-    virtual ~Layout();
-
-    virtual int setIdToSubdomainId(int gs, int ds, int zs) const;
-    virtual int subdomainIdToZoneSetDim(int sdom_id, int dim) const;
-    virtual void subdomainIdToSetId(int sdom_id, int &gs, int &ds, int &zs) const;
-    virtual Neighbor getNeighbor(int our_sdom_id, int dim, int dir) const = 0;
-    virtual std::pair<double, double> getSpatialExtents(int sdom_id, int dim) const = 0;
-    virtual int getNumZones(int sdom_id, int dim) const;
-
-  protected:
-    int num_group_sets;      // Number of group sets
-    int num_direction_sets;  // Number of direction sets
-    int num_zone_sets;       // Number of zone sets
-    int num_zone_sets_dim[3];// Number of zone sets in each dimension
-
-    int total_zones[3];      // Total number of zones in each dimension
-
-    int num_procs[3];        // Number of MPI ranks in each dimensions
-    int our_rank[3];         // Our mpi indices in xyz
-};
-
-class BlockLayout : public Layout {
-  public:
-    explicit BlockLayout(Input_Variables *input_vars);
-    virtual ~BlockLayout();
-
-    virtual Neighbor getNeighbor(int our_sdom_id, int dim, int dir) const;
-    virtual std::pair<double, double> getSpatialExtents(int sdom_id, int dim) const;
-};
-
-class ScatterLayout : public Layout {
-  public:
-    explicit ScatterLayout(Input_Variables *input_vars);
-    virtual ~ScatterLayout();
-
-    virtual Neighbor getNeighbor(int our_sdom_id, int dim, int dir) const;
-    virtual std::pair<double, double> getSpatialExtents(int sdom_id, int dim) const;
-};
-
-
-// Factory to create layout object
-Layout *createLayout(Input_Variables *input_vars);
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm.cpp
deleted file mode 100644
index dbc71af1a..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm.cpp
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke.h>
-#include <Kripke/ParallelComm.h>
-#include <Kripke/Grid.h>
-#include <Kripke/Subdomain.h>
-#include <Kripke/SubTVec.h>
-
-
-ParallelComm::ParallelComm(Grid_Data *grid_data_ptr) :
-  grid_data(grid_data_ptr)
-{
-
-}
-
-ParallelComm::~ParallelComm(){
-
-}
-
-int ParallelComm::computeTag(int mpi_rank, int sdom_id){
-  int mpi_size=1;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-#endif
-
-  int tag = mpi_rank + mpi_size*sdom_id;
-
-  return tag;
-}
-
-void ParallelComm::computeRankSdom(int tag, int &mpi_rank, int &sdom_id){
-  int mpi_size=1;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-#endif
-
-  mpi_rank = tag % mpi_size;
-  sdom_id = tag / mpi_size;
-}
-
-/**
-  Finds subdomain in the queue by its subdomain id.
-*/
-int ParallelComm::findSubdomain(int sdom_id){
-
-  // find subdomain in queue
-  int index;
-  for(index = 0;index < queue_sdom_ids.size();++ index){
-    if(queue_sdom_ids[index] == sdom_id){
-      break;
-    }
-  }
-  if(index == queue_sdom_ids.size()){
-    KripkeAbort("Cannot find subdomain id %d in work queue\n", sdom_id);
-  }
-
-  return index;
-}
-
-
-Subdomain *ParallelComm::dequeueSubdomain(int sdom_id){
-  int index = findSubdomain(sdom_id);
-
-  // Get subdomain pointer before removing it from queue
-  Subdomain *sdom = queue_subdomains[index];
-
-  // remove subdomain from queue
-  queue_sdom_ids.erase(queue_sdom_ids.begin()+index);
-  queue_subdomains.erase(queue_subdomains.begin()+index);
-  queue_depends.erase(queue_depends.begin()+index);
-
-  return sdom;
-}
-
-/**
-  Adds a subdomain to the work queue.
-  Determines if upwind dependencies require communication, and posts appropirate Irecv's.
-  All recieves use the plane_data[] arrays as recieve buffers.
-*/
-void ParallelComm::postRecvs(int sdom_id, Subdomain &sdom){
-  int mpi_rank=0;
-#ifdef KRIPKE_USE_MPI
-  int mpi_size=1;
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-#endif
-
-  // go thru each dimensions upwind neighbors, and add the dependencies
-  int num_depends = 0;
-  for(int dim = 0;dim < 3;++ dim){
-    // If it's a boundary condition, skip it
-    if(sdom.upwind[dim].mpi_rank < 0){
-      continue;
-    }
-
-    // If it's an on-rank communication (from another subdomain)
-    if(sdom.upwind[dim].mpi_rank == mpi_rank){
-      // skip it, but track the dependency
-      num_depends ++;
-      continue;
-    }
-
-#ifdef KRIPKE_USE_MPI
-    // Add request to pending list
-    recv_requests.push_back(MPI_Request());
-    recv_subdomains.push_back(sdom_id);
-
-    // compute the tag id of THIS subdomain (tags are always based on destination)
-    int tag = computeTag(sdom.upwind[dim].mpi_rank, sdom.upwind[dim].subdomain_id);
-
-    // Post the recieve
-    MPI_Irecv(sdom.plane_data[dim]->ptr(), sdom.plane_data[dim]->elements, MPI_DOUBLE, sdom.upwind[dim].mpi_rank,
-      tag, MPI_COMM_WORLD, &recv_requests[recv_requests.size()-1]);
-
-    // increment number of dependencies
-    num_depends ++;
-#endif
-  }
-
-  // add subdomain to queue
-  queue_sdom_ids.push_back(sdom_id);
-  queue_subdomains.push_back(&sdom);
-  queue_depends.push_back(num_depends);
-}
-
-void ParallelComm::postSends(Subdomain *sdom, double *src_buffers[3]){
-  // post sends for downwind dependencies
-  int mpi_rank=0;
-#ifdef KRIPKE_USE_MPI
-  int mpi_size=1;
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-#endif
-  for(int dim = 0;dim < 3;++ dim){
-    // If it's a boundary condition, skip it
-    if(sdom->downwind[dim].mpi_rank < 0){
-      continue;
-    }
-
-    // If it's an on-rank communication (to another subdomain)
-    if(sdom->downwind[dim].mpi_rank == mpi_rank){
-      // find the local subdomain in the queue, and decrement the counter
-      for(int i = 0;i < queue_sdom_ids.size();++ i){
-        if(queue_sdom_ids[i] == sdom->downwind[dim].subdomain_id){
-          queue_depends[i] --;
-          break;
-        }
-      }
-
-      // copy the boundary condition data into the downwinds plane data
-      Subdomain &sdom_downwind = grid_data->subdomains[sdom->downwind[dim].subdomain_id];
-      sdom_downwind.plane_data[dim]->copy(*sdom->plane_data[dim]);
-      int num_elem = sdom_downwind.plane_data[dim]->elements;
-      //double const * KRESTRICT src_ptr = sdom->plane_data[dim]->ptr();
-      double * KRESTRICT dst_ptr = sdom_downwind.plane_data[dim]->ptr();
-      for(int i = 0;i < num_elem;++ i){
-        dst_ptr[i] = src_buffers[dim][i];
-      }
-      continue;
-    }
-#ifdef KRIPKE_USE_MPI
-    // At this point, we know that we have to send an MPI message
-    // Add request to send queue
-    send_requests.push_back(MPI_Request());
-
-    // compute the tag id of TARGET subdomain (tags are always based on destination)
-    int tag = computeTag(mpi_rank, sdom->downwind[dim].subdomain_id);
-
-    // Post the send
-    MPI_Isend(src_buffers[dim], sdom->plane_data[dim]->elements, MPI_DOUBLE, sdom->downwind[dim].mpi_rank,
-      tag, MPI_COMM_WORLD, &send_requests[send_requests.size()-1]);
-#endif
-  }
-}
-
-
-// Checks if there are any outstanding subdomains to complete
-bool ParallelComm::workRemaining(void){
-#ifdef KRIPKE_USE_MPI
-  return (recv_requests.size() > 0 || queue_subdomains.size() > 0);
-#else
-  return (queue_subdomains.size() > 0);
-#endif
-}
-
-
-// Blocks until all sends have completed, and flushes the send queues
-void ParallelComm::waitAllSends(void){
-#ifdef KRIPKE_USE_MPI
-  // Wait for all remaining sends to complete, then return false
-  int num_sends = send_requests.size();
-  if(num_sends > 0){
-    std::vector<MPI_Status> status(num_sends);
-    MPI_Waitall(num_sends, &send_requests[0], &status[0]);
-    send_requests.clear();
-  }
-#endif
-}
-
-/**
-  Checks for incomming messages, and does relevant bookkeeping.
-*/
-void ParallelComm::testRecieves(void){
-
-#ifdef KRIPKE_USE_MPI
-  // Check for any recv requests that have completed
-  int num_requests = recv_requests.size();
-  bool done = false;
-  while(!done && num_requests > 0){
-    // Create array of status variables
-    std::vector<MPI_Status> recv_status(num_requests);
-
-    // Ask if either one or none of the recvs have completed?
-    int index; // this will be the index of request that completed
-    int complete_flag; // this is set to TRUE if somthing completed
-    MPI_Testany(num_requests, &recv_requests[0], &index, &complete_flag, &recv_status[0]);
-
-    if(complete_flag != 0){
-
-      // get subdomain that this completed for
-      int sdom_id = recv_subdomains[index];
-
-      // remove the request from the list
-      recv_requests.erase(recv_requests.begin()+index);
-      recv_subdomains.erase(recv_subdomains.begin()+index);
-      num_requests --;
-
-      // decrement the dependency count for that subdomain
-      for(int i = 0;i < queue_sdom_ids.size();++ i){
-        if(queue_sdom_ids[i] == sdom_id){
-          queue_depends[i] --;
-          break;
-        }
-      }
-    }
-    else{
-      done = true;
-    }
-  }
-#endif
-}
-
-
-std::vector<int> ParallelComm::getReadyList(void){
-  // build up a list of ready subdomains
-  std::vector<int> ready;
-  for(int i = 0;i < queue_depends.size();++ i){
-    if(queue_depends[i] == 0){
-      ready.push_back(queue_sdom_ids[i]);
-    }
-  }
-  return ready;
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm.h
deleted file mode 100644
index 7d7f40254..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_COMM_H__
-#define KRIPKE_COMM_H__
-
-#include<vector>
-
-#ifdef KRIPKE_USE_MPI
-#include<mpi.h>
-#endif
-
-struct Grid_Data;
-struct Subdomain;
-
-class ParallelComm {
-  public:
-    explicit ParallelComm(Grid_Data *grid_data_ptr);
-    virtual ~ParallelComm();
-
-    // Adds a subdomain to the work queue
-    virtual void addSubdomain(int sdom_id, Subdomain &sdom) = 0;
-
-    // Checks if there are any outstanding subdomains to complete
-    // false indicates all work is done, and all sends have completed
-    virtual bool workRemaining(void);
-
-    // Returns a vector of ready subdomains, and clears them from the ready queue
-    virtual std::vector<int> readySubdomains(void) = 0;
-
-    // Marks subdomains as complete, and performs downwind communication
-    virtual void markComplete(int sdom_id) = 0;
-
-  protected:
-    static int computeTag(int mpi_rank, int sdom_id);
-    static void computeRankSdom(int tag, int &mpi_rank, int &sdom_id);
-
-    int findSubdomain(int sdom_id);
-    Subdomain *dequeueSubdomain(int sdom_id);
-    void postRecvs(int sdom_id, Subdomain &sdom);
-    void postSends(Subdomain *sdom, double *buffers[3]);
-    void testRecieves(void);
-    void waitAllSends(void);
-    std::vector<int> getReadyList(void);
-
-
-    Grid_Data *grid_data;
-
-    // These vectors contian the recieve requests
-#ifdef KRIPKE_USE_MPI
-    std::vector<MPI_Request> recv_requests;
-#endif
-    std::vector<int> recv_subdomains;
-
-    // These vectors have the subdomains, and the remaining dependencies
-    std::vector<int> queue_sdom_ids;
-    std::vector<Subdomain *> queue_subdomains;
-    std::vector<int> queue_depends;
-
-    // These vectors have the remaining send requests that are incomplete
-#ifdef KRIPKE_USE_MPI
-    std::vector<MPI_Request> send_requests;
-#endif
-};
-
-
-class SweepComm : public ParallelComm {
-  public:
-    explicit SweepComm(Grid_Data *data);
-    virtual ~SweepComm();
-
-    virtual void addSubdomain(int sdom_id, Subdomain &sdom);
-    virtual bool workRemaining(void);
-    virtual std::vector<int> readySubdomains(void);
-    virtual void markComplete(int sdom_id);
-};
-
-
-class BlockJacobiComm : public ParallelComm {
-  public:
-    explicit BlockJacobiComm(Grid_Data *data);
-    virtual ~BlockJacobiComm();
-
-    void addSubdomain(int sdom_id, Subdomain &sdom);
-    bool workRemaining(void);
-    std::vector<int> readySubdomains(void);
-    void markComplete(int sdom_id);
-
-  private:
-    bool posted_sends;
-};
-
-
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm/BlockJacobiComm.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm/BlockJacobiComm.cpp
deleted file mode 100644
index 8dd48cb86..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm/BlockJacobiComm.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke/ParallelComm.h>
-#include <Kripke/SubTVec.h>
-#include <Kripke/Grid.h>
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <vector>
-#include <stdio.h>
-
-
-BlockJacobiComm::BlockJacobiComm(Grid_Data *data) : ParallelComm(data), posted_sends(false)
-{
-
-}
-
-BlockJacobiComm::~BlockJacobiComm(){
-}
-
-/**
-  Adds a subdomain to the work queue.
-  Determines if upwind dependencies require communication, and posts appropirate Irecv's.
-*/
-void BlockJacobiComm::addSubdomain(int sdom_id, Subdomain &sdom){
-  // Copy old flux data to send buffers
-  for(int dim = 0;dim < 3;++ dim){
-    int nelem = sdom.plane_data[dim]->elements;
-    double const * KRESTRICT src = sdom.plane_data[dim]->ptr();
-    double * KRESTRICT dst = sdom.old_plane_data[dim]->ptr();
-    for(int i = 0;i < nelem;++ i){
-      dst[i] = src[i];
-    }
-  }
-
-  // post recieves
-  postRecvs(sdom_id, sdom);
-
-}
-
-// Checks if there are any outstanding subdomains to complete
-// false indicates all work is done, and all sends have completed
-bool BlockJacobiComm::workRemaining(void){
-  if(!posted_sends){
-    // post sends for all queued subdomains
-    for(int i = 0;i < queue_subdomains.size();++ i){
-      Subdomain *sdom = queue_subdomains[i];
-
-      // Send new downwind info for sweep
-      double *buf[3] = {
-        sdom->old_plane_data[0]->ptr(),
-        sdom->old_plane_data[1]->ptr(),
-        sdom->old_plane_data[2]->ptr()
-      };
-
-      postSends(sdom, buf);
-    }
-    posted_sends = true;
-  }
-  // Since we communicate fluxes before local sweeps, when we are
-  // out of work, there is no further synchronization
-  if(ParallelComm::workRemaining()){
-    return true;
-  }
-  waitAllSends();
-
-  return false;
-}
-
-/**
-  Checks for incomming messages, and returns a list of ready subdomain id's
-*/
-std::vector<int> BlockJacobiComm::readySubdomains(void){
-  testRecieves();
-
-  // return list of any ready subdomains
-  return getReadyList();
-}
-
-
-
-void BlockJacobiComm::markComplete(int sdom_id){
-  // remove subdomain from work queue
-  dequeueSubdomain(sdom_id);
-}
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm/SweepComm.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm/SweepComm.cpp
deleted file mode 100644
index 934276008..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/ParallelComm/SweepComm.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke/ParallelComm.h>
-#include <Kripke/SubTVec.h>
-#include <Kripke/Grid.h>
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <vector>
-#include <stdio.h>
-
-
-SweepComm::SweepComm(Grid_Data *data) : ParallelComm(data)
-{
-
-}
-
-SweepComm::~SweepComm(){
-}
-
-/**
-  Adds a subdomain to the work queue.
-  Determines if upwind dependencies require communication, and posts appropirate Irecv's.
-*/
-void SweepComm::addSubdomain(int sdom_id, Subdomain &sdom){
-  // Post recieves for upwind dependencies, and add to the queue
-  postRecvs(sdom_id, sdom);
-}
-
-
-// Checks if there are any outstanding subdomains to complete
-// false indicates all work is done, and all sends have completed
-bool SweepComm::workRemaining(void){
-  // If there are outstanding subdomains to process, return true
-  if(ParallelComm::workRemaining()){
-    return true;
-  }
-
-  // No more work, so make sure all of our sends have completed
-  // before we continue
-  waitAllSends();
-
-  return false;
-}
-
-
-/**
-  Checks for incomming messages, and returns a list of ready subdomain id's
-*/
-std::vector<int> SweepComm::readySubdomains(void){
-  // check for incomming messages
-  testRecieves();
-
-  // build up a list of ready subdomains
-  return getReadyList();
-}
-
-
-void SweepComm::markComplete(int sdom_id){
-  // Get subdomain pointer and remove from work queue
-  Subdomain *sdom = dequeueSubdomain(sdom_id);
-
-  // Send new downwind info for sweep
-  double *buf[3] = {
-    sdom->plane_data[0]->ptr(),
-    sdom->plane_data[1]->ptr(),
-    sdom->plane_data[2]->ptr()
-  };
-  postSends(sdom, buf);
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/SubTVec.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/SubTVec.h
deleted file mode 100644
index eb3ca9242..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/SubTVec.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_SUBTVEC_H__
-#define KRIPKE_SUBTVEC_H__
-
-#define KRIPKE_ALIGN_DATA
-
-#define KRIPKE_ALIGN 64
-
-#include <Kripke/Kernel.h>
-#include <algorithm>
-#include <vector>
-#include <stdlib.h>
-
-/**
- *  A transport vector (used for Psi and Phi, RHS, etc.)
- *
- *  This provides the inner most three strides of
- *    Psi[GS][DS][G][D][Z]
- *  but in whatever nesting order is specified.
- */
-struct SubTVec {
-private:
-  // disallow
-  SubTVec(SubTVec const &c);
-  SubTVec &operator=(SubTVec const &c);
-
-public:
-  SubTVec(Nesting_Order nesting, int ngrps, int ndir_mom, int nzones):
-    groups(ngrps),
-    directions(ndir_mom),
-    zones(nzones),
-    elements(groups*directions*zones),
-    data_linear(NULL)
-  {
-//#ifdef RAJA_ENABLE_CUDA
-    
-#ifdef KRIPKE_ALIGN_DATA
-    int status = posix_memalign((void**)&data_linear, KRIPKE_ALIGN, sizeof(double)*elements);
-    if(status != 0){
-    	printf("Error allocating data\n");
-    	data_linear = NULL;
-    }
-#else
-    data_linear = (double *) malloc(sizeof(double)*elements);
-#endif // align
-//#endif // cuda
-    setupIndices(nesting, data_linear);
-  }
-
-
-  /**
-   * ALIASING version of constructor.
-   * Use this when you have a data buffer already, and don't want this class
-   * to do any memory management.
-   */
-  SubTVec(Nesting_Order nesting, int ngrps, int ndir_mom, int nzones, double *ptr):
-    groups(ngrps),
-    directions(ndir_mom),
-    zones(nzones),
-    elements(groups*directions*zones),
-    data_linear(NULL)
-  {
-    setupIndices(nesting, ptr);
-  }
-
-  ~SubTVec(){
-    if(data_linear != NULL){
-      free(data_linear);
-    }
-  }
-
-  void setupIndices(Nesting_Order nesting, double *ptr){
-    // setup nesting order
-    switch(nesting){
-      case NEST_GDZ:
-        ext_to_int[0] = 0;
-        ext_to_int[1] = 1;
-        ext_to_int[2] = 2;
-        break;
-      case NEST_GZD:
-        ext_to_int[0] = 0;
-        ext_to_int[2] = 1;
-        ext_to_int[1] = 2;
-        break;
-      case NEST_DZG:
-        ext_to_int[1] = 0;
-        ext_to_int[2] = 1;
-        ext_to_int[0] = 2;
-        break;
-      case NEST_DGZ:
-        ext_to_int[1] = 0;
-        ext_to_int[0] = 1;
-        ext_to_int[2] = 2;
-        break;
-      case NEST_ZDG:
-        ext_to_int[2] = 0;
-        ext_to_int[1] = 1;
-        ext_to_int[0] = 2;
-        break;
-      case NEST_ZGD:
-        ext_to_int[2] = 0;
-        ext_to_int[0] = 1;
-        ext_to_int[1] = 2;
-        break;
-    }
-
-    // setup dimensionality
-    int size_ext[3];
-    size_ext[0] = groups;
-    size_ext[1] = directions;
-    size_ext[2] = zones;
-
-    // map to internal indices
-    for(int i = 0; i < 3; ++i){
-      size_int[ext_to_int[i]] = size_ext[i];
-    }
-
-    data_pointer = ptr;
-  }
-
-  inline double* ptr(void){
-    return data_pointer;
-  }
-
-  inline double* ptr(int g, int d, int z){
-    return &(*this)(g,d,z);
-  }
-
-  // These are NOT efficient.. just used to re-stride data for comparisons
-  inline double &operator()(int g, int d, int z) {
-    int idx[3];
-    idx[ext_to_int[0]] = g;
-    idx[ext_to_int[1]] = d;
-    idx[ext_to_int[2]] = z;
-    int offset = idx[0] * size_int[1]*size_int[2] +
-                 idx[1] * size_int[2] +
-                 idx[2];
-    return data_pointer[offset];
-  }
-  inline double operator()(int g, int d, int z) const {
-    return (*const_cast<SubTVec*>(this))(g,d,z);
-  }
-
-  inline double sum(void) const {
-    double s = 0.0;
-    for(size_t i = 0;i < elements;++ i){
-      s+= data_linear[i];
-    }
-    return s;
-  }
-
-  inline void clear(double v){
-#ifdef KRIPKE_USE_OPENMP
-#pragma omp parallel for
-#endif
-    for(int i = 0;i < elements;++ i){
-      data_linear[i] = v;
-    }
-  }
-
-  inline void randomizeData(void){
-    for(int i = 0;i < elements;++ i){
-      data_linear[i] = drand48();
-    }
-  }
-
-  inline void copy(SubTVec const &b){
-    for(int g = 0;g < groups;++ g){
-      for(int d = 0;d < directions; ++ d){
-        for(int z = 0;z < zones;++ z){
-          // Copy using abstract indexing
-          (*this)(g,d,z) = b(g,d,z);
-        }
-      }
-    }
-  }
-
-  inline bool compare(std::string const &name, SubTVec const &b,
-      double tol, bool verbose){
-
-    bool is_diff = false;
-    int num_wrong = 0;
-    for(int g = 0;g < groups;++ g){
-      for(int d = 0;d < directions; ++ d){
-        for(int z = 0;z < zones;++ z){
-          // Copy using abstract indexing
-          double err = std::abs((*this)(g,d,z) - b(g,d,z));
-          if(err > tol){
-            is_diff = true;
-            if(verbose){
-              printf("%s[g=%d, d=%d, z=%d]: |%e - %e| = %e\n",
-                  name.c_str(), g,d,z, (*this)(g,d,z), b(g,d,z), err);
-              num_wrong ++;
-              if(num_wrong > 100){
-                return true;
-              }
-            }
-          }
-        }
-      }
-    }
-    return is_diff;
-  }
-
-  int ext_to_int[3]; // external index to internal index mapping
-  int size_int[3]; // size of each dimension in internal indices
-
-  int groups, directions, zones, elements;
-  double *data_pointer;
-  double *data_linear;
-};
-
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Subdomain.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Subdomain.cpp
deleted file mode 100644
index 69a16bab9..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Subdomain.cpp
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke.h>
-#include <Kripke/Grid.h>
-#include <Kripke/SubTVec.h>
-#include <Kripke/Input_Variables.h>
-
-#include <cmath>
-#include <sstream>
-
-
-namespace {
-  /**
-    This function defined the material distribution in space.
-    This defines Problem 3 from Kobayashi
-    Where Region 1 is material 0, 2 is 1 and 3 is 2.
-  */
-  inline int queryMaterial(double x, double y, double z){
-    // Problem is defined for one octant, with reflecting boundaries
-    // We "unreflect" it here by taking abs values
-    x = std::abs(x);
-    y = std::abs(y);
-    z = std::abs(z);
-
-    // Central 20x20x20 box is Region 1
-    if(x <= 10.0 && y <= 10.0 && z <= 10.0){
-      return 0;
-    }
-
-    // Leg 1 of Region 2
-    if(x <= 10.0 && y <= 60.0 && z <= 10.0){
-      return 1;
-    }
-
-    // Leg 2 of Region 2
-    if(x <= 40.0 && y >= 50.0 && y <= 60.0 && z <= 10.0){
-      return 1;
-    }
-
-    // Leg 3 of Region 2
-    if(x >= 30.0 && x <= 40.0 && y >= 50.0 && y <= 60.0 && z <= 40.0){
-      return 1;
-    }
-
-    // Leg 4 of Region 2
-    if(x >= 30.0 && x <= 40.0 && y >= 50.0 && z >= 30.0 && z <= 40.0){
-      return 1;
-    }
-
-    // Rest is filled with region 3
-    return 2;
-  }
-}
-
-
-
-Subdomain::Subdomain() :
-  idx_dir_set(0),
-  idx_group_set(0),
-  idx_zone_set(0),
-  num_groups(0),
-  num_directions(0),
-  num_zones(0),
-  group0(0),
-  direction0(0),
-  psi(NULL),
-  rhs(NULL),
-  sigt(NULL),
-  directions(NULL),
-  ell(NULL),
-  ell_plus(NULL),
-  phi(NULL),
-  phi_out(NULL)
-{
-  for(int dim = 0;dim < 3;++ dim){
-    plane_data[dim] = NULL;
-    old_plane_data[dim] = NULL;
-  }
-}
-Subdomain::~Subdomain(){
-  delete psi;
-  delete rhs;
-  delete sigt;
-  for(int dim = 0;dim < 3;++ dim){
-    delete plane_data[dim];
-    delete old_plane_data[dim];
-  }
-}
-
-
-/**
-  Setup subdomain and allocate data
-*/
-void Subdomain::setup(int sdom_id, Input_Variables *input_vars, int gs, int ds, int zs,
-    std::vector<Directions> &direction_list, Kernel *kernel, Layout *layout)
-{
-  // set the set indices
-  idx_group_set = gs;
-  idx_dir_set = ds;
-  idx_zone_set = zs;
-
-  num_groups = input_vars->num_groups / input_vars->num_groupsets;
-  group0 = gs * num_groups;
-
-  num_directions = input_vars->num_directions / input_vars->num_dirsets;
-  direction0 = ds * num_directions;
-  directions = &direction_list[direction0];
-
-  num_zones = 1;
-  for(int dim = 0;dim < 3;++ dim){
-    // Compute number of zones in this dimension
-    nzones[dim] = layout->getNumZones(sdom_id, dim);
-    num_zones *= nzones[dim];
-
-    // Compute grid deltas in this dimension (including ghost zone deltas)
-    std::pair<double, double> dim_extent = layout->getSpatialExtents(sdom_id, dim);
-    zeros[dim] = dim_extent.first;
-    double dx = (dim_extent.second-dim_extent.first)/(double)nzones[dim];
-    deltas[dim].resize(nzones[dim]+2);
-    for(int z = 0;z < nzones[dim]+2;++ z){
-      deltas[dim][z] = dx;
-    }
-  }
-
-  // allocate storage for the sweep boundary data (upwind and downwind share same buffer)
-  plane_data[0] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[1] * nzones[2]);
-  plane_data[1] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[0] * nzones[2]);
-  plane_data[2] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[0] * nzones[1]);
-
-  // For block-jacobi parallel method
-  old_plane_data[0] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[1] * nzones[2]);
-  old_plane_data[1] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[0] * nzones[2]);
-  old_plane_data[2] = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, nzones[0] * nzones[1]);
-
-  // allocate the storage for solution and source terms
-  psi = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, num_zones);
-  psi->clear(0.0);
-  rhs = new SubTVec(kernel->nestingPsi(), num_groups, num_directions, num_zones);
-  sigt = new SubTVec(kernel->nestingSigt(), num_groups, 1, num_zones);
-  sigt->clear(0.0);
-
-  computeSweepIndexSet();
-
-  // Setup neighbor data
-  int dirs[3] = { directions[0].id, directions[0].jd, directions[0].kd};
-  for(int dim = 0;dim < 3;++ dim){
-    downwind[dim] = layout->getNeighbor(sdom_id, dim, dirs[dim]);
-    upwind[dim] = layout->getNeighbor(sdom_id, dim, -1 * dirs[dim]);
-  }
-
-  // paint the mesh
-  reg_volume[0] = 0.0;
-  reg_volume[1] = 0.0;
-  reg_volume[2] = 0.0;
-  int num_subsamples = 4; // number of subsamples per spatial dimension
-  double sample_vol_frac = 1.0 / (double)(num_subsamples*num_subsamples*num_subsamples);
-  int zone_id = 0;
-  double pz = zeros[2];
-
-  for (int k = 0; k < nzones[2]; k++) {
-    double sdz = deltas[2][k+1] / (double)(num_subsamples+1);
-    double py = zeros[1];
-
-    for (int j = 0; j != nzones[1]; j ++) {
-      double sdy = deltas[1][j+1] / (double)(num_subsamples+1);
-      double px = zeros[0];
-
-      for (int i = 0; i != nzones[0]; i ++) {
-        double sdx = deltas[0][i+1] / (double)(num_subsamples+1);
-
-        double zone_volume = deltas[0][i+1] * deltas[1][j+1] * deltas[2][k+1];
-        volume.push_back(zone_volume);
-
-        // subsample probe the geometry to get our materials
-        double frac[3] = {0.0, 0.0, 0.0}; // fraction of both materials
-        double spz = pz + sdz;
-
-        for(int sk = 0;sk < num_subsamples;++ sk){
-          double spy = py + sdy;
-          for(int sj = 0;sj < num_subsamples;++ sj){
-            double spx = px + sdx;
-            for(int si = 0;si < num_subsamples;++ si){
-
-              int mat = queryMaterial(spx, spy, spz);
-              frac[mat] += sample_vol_frac;
-
-              spx += sdx;
-            }
-            spy += sdy;
-          }
-          spz += sdz;
-        }
-
-        // Add material to zone
-        int nmixed = 0;
-        for(int mat = 0;mat < 3;++ mat){          
-          if(frac[mat] > 0.0){
-            nmixed ++;
-            if(nmixed == 1){
-              zones_to_mixed.push_back(mixed_to_zones.size());
-            }
-            mixed_to_zones.push_back(zone_id);
-            mixed_material.push_back(mat);
-            mixed_fraction.push_back(frac[mat]);
-            reg_volume[mat] += frac[mat] * zone_volume;
-            
-            // initialize background sigt
-            for(int g = 0;g < num_groups;++ g){
-              (*sigt)(g,0,zone_id) += frac[mat] * input_vars->sigt[mat];
-            }
-          }
-        }
-        num_mixed.push_back(nmixed);
-
-        // increment zone
-        px += deltas[0][i+1];
-        zone_id ++;
-      }
-      py += deltas[1][j+1];
-    }
-    pz += deltas[2][k+1];
-  }
-}
-
-void Subdomain::setVars(SubTVec *ell_ptr, SubTVec *ell_plus_ptr,
-    SubTVec *phi_ptr, SubTVec *phi_out_ptr){
-
-  ell = ell_ptr;
-  ell_plus = ell_plus_ptr;
-  phi = phi_ptr;
-  phi_out = phi_out_ptr;
-}
-
-
-/**
- * Randomizes data for a set.
- */
-void Subdomain::randomizeData(void){
-  psi->randomizeData();
-  rhs->randomizeData();
-  sigt->randomizeData();
-
-  for(int d = 0;d < 3;++ d){
-    for(int i = 0;i < deltas[d].size();++ i){
-      deltas[d][i] = drand48();
-    }
-  }
-}
-
-/**
- * Copies two sets, allowing for different nestings.
- */
-void Subdomain::copy(Subdomain const &b){
-  psi->copy(*b.psi);
-  rhs->copy(*b.rhs);
-  sigt->copy(*b.sigt);
-
-  for(int d = 0;d < 3;++ d){
-    deltas[d] = b.deltas[d];
-  }
-}
-
-/**
- * Compares two sets, allowing for different nestings.
- */
-bool Subdomain::compare(Subdomain const &b, double tol, bool verbose){
-  std::stringstream namess;
-  namess << "gdset[gs=" << idx_group_set << ", ds=" << idx_dir_set << ", zs=" << idx_zone_set << "]";
-  std::string name = namess.str();
-
-  bool is_diff = false;
-  is_diff |= psi->compare(name+".psi", *b.psi, tol, verbose);
-  is_diff |= rhs->compare(name+".rhs", *b.rhs, tol, verbose);
-  is_diff |= sigt->compare(name+".sigt", *b.sigt, tol, verbose);
-
-  is_diff |= compareVector(name+".deltas[0]", deltas[0], b.deltas[0], tol, verbose);
-  is_diff |= compareVector(name+".deltas[1]", deltas[1], b.deltas[1], tol, verbose);
-  is_diff |= compareVector(name+".deltas[2]", deltas[2], b.deltas[2], tol, verbose);
-
-  return is_diff;
-}
-
-/**
- * Compute sweep index sets.
- * Determines logical indices, and increments for i,j,k based on grid
- * information and quadrature set sweeping direction.
- */
-void Subdomain::computeSweepIndexSet(void){
-  if(directions[0].id > 0){
-    sweep_block.start_i = 0;
-    sweep_block.end_i = nzones[0];
-    sweep_block.inc_i = 1;
-  }
-  else {
-    sweep_block.start_i = nzones[0]-1;
-    sweep_block.end_i = -1;
-    sweep_block.inc_i = -1;
-  }
-
-  if(directions[0].jd > 0){
-    sweep_block.start_j = 0;
-    sweep_block.end_j = nzones[1];
-    sweep_block.inc_j = 1;
-  }
-  else {
-    sweep_block.start_j = nzones[1]-1;
-    sweep_block.end_j = -1;
-    sweep_block.inc_j = -1;
-  }
-
-  if(directions[0].kd > 0){
-    sweep_block.start_k = 0;
-    sweep_block.end_k = nzones[2];
-    sweep_block.inc_k =  1;
-  }
-  else {
-    sweep_block.start_k = nzones[2]-1;
-    sweep_block.end_k = -1;
-    sweep_block.inc_k = -1;
-  }
-}
-
-namespace {
-  double FactFcn(int n)
-  {
-    double fact = 1.0;
-    for(int i = n;i > 0 ;--i){
-      fact *= (double)i;
-    }
-    return(fact);
-  }
-
-  inline double PnmFcn(int n, int m, double x)
-  {
-    /*-----------------------------------------------------------------
-     * It is assumed that 0 <= m <= n and that abs(x) <= 1.0.
-     * No error checking is done, however.
-     *---------------------------------------------------------------*/
-    double fact, pnn, pmm, pmmp1, somx2;
-
-    int i, nn;
-
-    if(std::abs(x) > 1.0){
-      KripkeAbort("Bad input to ardra_PnmFcn: abs(x) > 1.0, x = %e\n", x);
-    }
-    else if((x > 1.0) && (x <= 1.0)){
-      x = 1.0;
-    }
-    else if((-1.0 <= x ) && (x < -1.0)){
-      x = -1.0;
-    }
-
-    pmm=1.0;
-    if(m > 0){
-      somx2=sqrt((1.0-x)*(1.0+x));
-      fact=1.0;
-      for(i=1; i<=m; i++){
-        pmm *= -fact*somx2;
-        fact += 2.0;
-      }
-    }
-    if(n == m){
-      return(pmm);
-    }
-    else {
-      pmmp1=x*(2*m+1)*pmm;
-      if(n == (m+1)){
-        return(pmmp1);
-      }
-      else {
-        for(nn=m+2; nn<=n; nn++){
-          pnn=(x*(2*nn-1)*pmmp1-(nn+m-1)*pmm)/(nn-m);
-          pmm=pmmp1;
-          pmmp1=pnn;
-        }
-        return(pnn);
-      }
-    }
-  }
-
-  inline double YnmFcn(int n, int m, double mu, double eta, double xi)
-  {
-    double fac1, fac2, anm, ynm, pnm, dm0, taum, tmp, phi, phi_tmp;
-    double floor=1.e-20;
-    int nn, mm;
-
-    /* Calculate the correct phi for omega=(mu,eta,xi) */
-    tmp = fabs(eta/(mu+floor));
-    phi_tmp = atan(tmp);
-    if( (mu>0) && (eta>0) ){
-      phi = phi_tmp;
-    }
-    else if( (mu<0) && (eta>0) ){
-      phi = M_PI - fabs(phi_tmp);
-    }
-    else if( (mu<0) && (eta<0) ){
-      phi = M_PI + fabs(phi_tmp);
-    }
-    else {
-      phi = 2.0*M_PI - fabs(phi_tmp);
-    }
-
-    /* Begin evaluation of Ynm(omega) */
-    nn = n - std::abs(m);
-    fac1 = (double) FactFcn(nn);
-    nn = n + std::abs(m);
-    fac2 = (double) FactFcn(nn);
-    mm = std::abs(m);
-    pnm = PnmFcn(n, mm, xi);
-    tmp = ((double) m)*phi;
-    if(m >= 0){
-      taum = cos(tmp);
-    }
-    else {taum = sin(-tmp); }
-    if(m == 0){
-      dm0 = 1.0;
-    }
-    else {dm0 = 0.0; }
-    tmp = ((2*n+1)*fac1)/(2.0*(1.0+dm0)*M_PI*fac2);
-    anm = sqrt( tmp );
-    ynm = anm*pnm*taum;
-    return(ynm);
-  }
-}
-
-/**
- * Compute L and L+
- * This assumes that the quadrature set is defined.
- */
-void Subdomain::computeLLPlus(int legendre_order){
-  double SQRT4PI = std::sqrt(4*M_PI);
-  for(int n=0, nm=0; n < legendre_order+1; n++){
-    for(int m=-n; m<=n; m++){
-      for(int d=0; d<num_directions; d++){
-        // Get quadrature point info
-        double xcos = (directions[d].id)*(directions[d].xcos);
-        double ycos = (directions[d].jd)*(directions[d].ycos);
-        double zcos = (directions[d].kd)*(directions[d].zcos);
-        double w =  directions[d].w;
-
-        double ynm = YnmFcn(n, m, xcos, ycos, zcos);
-
-        // Compute element of L and L+
-        (*ell)(nm,d,0) = w*ynm/SQRT4PI;
-        (*ell_plus)(nm,d,0) = ynm*SQRT4PI;
-      }
-      nm ++;
-    }
-  }
-}
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Subdomain.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Subdomain.h
deleted file mode 100644
index 6946b9a3b..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Subdomain.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_SUBDOMAIN_H__
-#define KRIPKE_SUBDOMAIN_H__
-
-#include <vector>
-#include <Kripke/Layout.h>
-
-// Foreward Decl
-struct Directions;
-struct SubTVec;
-struct Input_Variables;
-class Kernel;
-
-/**
- * Provides sweep index sets for a given octant.
- * This generalizes the sweep pattern, and allows for experimenting with
- * a tiled approach to on-node sweeps.
- */
-struct Grid_Sweep_Block {
-  int start_i, start_j, start_k; // starting index
-  int end_i, end_j, end_k; // termination conditon (one past)
-  int inc_i, inc_j, inc_k; // increment
-};
-
-
-
-/**
- * Contains parameters and variables that describe a single Group Set and
- * Direction Set.
- */
-struct Subdomain {
-  Subdomain();
-  ~Subdomain();
-
-  void setup(int sdom_id, Input_Variables *input_vars, int gs, int ds, int zs,
-    std::vector<Directions> &direction_list, Kernel *kernel, Layout *layout);
-
-  void setVars(SubTVec *ell_ptr, SubTVec *ell_plus_ptr,
-    SubTVec *phi_ptr, SubTVec *phi_out_ptr);
-
-  void randomizeData(void);
-  void copy(Subdomain const &b);
-  bool compare(Subdomain const &b, double tol, bool verbose);
-  void computeSweepIndexSet(void);
-  void computeLLPlus(int legendre_order);
-
-  int idx_group_set;
-  int idx_dir_set;
-  int idx_zone_set;
-
-  int num_groups;       // Number of groups in this set
-  int num_directions;   // Number of directions in this set
-  int num_zones;        // Number of zones in this set
-
-  double zeros[3];                     // origin of local mesh
-  int nzones[3];                    // Number of zones in each dimension
-  std::vector<double> deltas[3];    // Spatial grid deltas in each dimension (including ghost zones)
-
-  int group0;           // Starting global group id
-  int direction0;       // Starting global direction id
-
-  Grid_Sweep_Block sweep_block;
-
-  // Neighbors
-  Neighbor upwind[3];   // Upwind dependencies in x,y,z
-  Neighbor downwind[3]; // Downwind neighbors in x,y,z
-
-  // Sweep boundary data
-  SubTVec *plane_data[3];
-  SubTVec *old_plane_data[3];
-
-  // Variables
-  SubTVec *psi;         // Solution
-  SubTVec *rhs;         // RHS, source term
-  SubTVec *sigt;        // Zonal per-group cross-section
-
-  // Pointers into directions and directionset data from Grid_Data
-  Directions *directions;
-  SubTVec *ell;
-  SubTVec *ell_plus;
-  SubTVec *phi;
-  SubTVec *phi_out;
-
-  // Materials on the mesh, used for scattering lookup
-  double reg_volume[3];               // volume of each material region
-  std::vector<double> volume;         // volume of each zone
-  std::vector<int> mixed_to_zones;    // mapping from mixed slot to zones
-  std::vector<int> num_mixed;         // mapping from mixed slot to zones
-  std::vector<int> zones_to_mixed;    // mapping from zones to first mixed slot
-  std::vector<int> mixed_material;    // material number for each mixed slot
-  std::vector<double> mixed_fraction; // volume fraction each mixed slot
-};
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Sweep_Solver.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Sweep_Solver.cpp
deleted file mode 100644
index 0c321b082..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Sweep_Solver.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke.h>
-#include <Kripke/Subdomain.h>
-#include <Kripke/SubTVec.h>
-#include <Kripke/ParallelComm.h>
-#include <Kripke/Grid.h>
-#include <vector>
-#include <stdio.h>
-
-#ifdef KRIPKE_USE_MPI
-#include<mpi.h>
-#endif
-
-/**
-  Run solver iterations.
-*/
-int SweepSolver (Grid_Data *grid_data, bool block_jacobi)
-{
-  Kernel *kernel = grid_data->kernel;
-
-  int mpi_rank = 0;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-#endif
-
-  BLOCK_TIMER(grid_data->timing, Solve);
-
-
-  // Loop over iterations
-  double part_last = 0.0;
-  for(int iter = 0;iter < grid_data->niter;++ iter){
-
-    /*
-     * Compute the RHS:  rhs = LPlus*S*L*psi + Q
-     */
-
-    // Discrete to Moments transformation (phi = L*psi)
-    {
-      BLOCK_TIMER(grid_data->timing, LTimes);
-      kernel->LTimes(grid_data);
-    }
-
-    // Compute Scattering Source Term (psi_out = S*phi)
-    {
-      BLOCK_TIMER(grid_data->timing, Scattering);
-      kernel->scattering(grid_data);
-    }
-
-    // Compute External Source Term (psi_out = psi_out + Q)
-    {
-      BLOCK_TIMER(grid_data->timing, Source);
-      kernel->source(grid_data);
-    }
-
-    // Moments to Discrete transformation (rhs = LPlus*psi_out)
-    {
-      BLOCK_TIMER(grid_data->timing, LPlusTimes);
-      kernel->LPlusTimes(grid_data);
-    }
-
-    /*
-     * Sweep (psi = Hinv*rhs)
-     */
-    {
-      BLOCK_TIMER(grid_data->timing, Sweep);
-
-      if(true){
-        // Create a list of all groups
-        std::vector<int> sdom_list(grid_data->subdomains.size());
-        for(int i = 0;i < grid_data->subdomains.size();++ i){
-          sdom_list[i] = i;
-        }
-
-        // Sweep everything
-        SweepSubdomains(sdom_list, grid_data, block_jacobi);
-      }
-      // This is the ARDRA version, doing each groupset sweep independently
-      else{
-        for(int group_set = 0;group_set < grid_data->num_group_sets;++ group_set){
-          std::vector<int> sdom_list;
-          // Add all subdomains for this groupset
-          for(int s = 0;s < grid_data->subdomains.size();++ s){
-            if(grid_data->subdomains[s].idx_group_set == group_set){
-              sdom_list.push_back(s);
-            }
-          }
-
-          // Sweep the groupset
-          SweepSubdomains(sdom_list, grid_data, block_jacobi);
-        }
-      }
-    }
-
-    {
-      BLOCK_TIMER(grid_data->timing, ParticleEdit);
-      double part = grid_data->particleEdit();
-      if(mpi_rank==0){
-        printf("iter %d: particle count=%e, change=%e\n", iter, part, (part-part_last)/part);
-      }
-      part_last = part;
-    }
-  }
-  return(0);
-}
-
-
-
-/**
-  Perform full parallel sweep algorithm on subset of subdomains.
-*/
-void SweepSubdomains (std::vector<int> subdomain_list, Grid_Data *grid_data, bool block_jacobi)
-{
-  // Create a new sweep communicator object
-  ParallelComm *comm = NULL;
-  if(block_jacobi){
-    comm = new BlockJacobiComm(grid_data);
-  }
-  else {
-    comm = new SweepComm(grid_data);
-  }
-
-  // Add all subdomains in our list
-  for(int i = 0;i < subdomain_list.size();++ i){
-    int sdom_id = subdomain_list[i];
-    comm->addSubdomain(sdom_id, grid_data->subdomains[sdom_id]);
-  }
-
-  /* Loop until we have finished all of our work */
-  while(comm->workRemaining()){
-
-    // Get a list of subdomains that have met dependencies
-    std::vector<int> sdom_ready = comm->readySubdomains();
-    int backlog = sdom_ready.size();
-
-    // Run top of list
-    if(backlog > 0){
-      int sdom_id = sdom_ready[0];
-      Subdomain &sdom = grid_data->subdomains[sdom_id];
-      // Clear boundary conditions
-      for(int dim = 0;dim < 3;++ dim){
-        if(sdom.upwind[dim].subdomain_id == -1){
-          sdom.plane_data[dim]->clear(0.0);
-        }
-      }
-      {
-        BLOCK_TIMER(grid_data->timing, Sweep_Kernel);
-        // Perform subdomain sweep
-        grid_data->kernel->sweep(&sdom);
-      }
-
-      // Mark as complete (and do any communication)
-      comm->markComplete(sdom_id);
-    }
-  }
-
-  delete comm;
-}
-
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Test/TestKernels.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Test/TestKernels.cpp
deleted file mode 100644
index 1ee5134bb..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Test/TestKernels.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke/Test/TestKernels.h>
-
-#include <Kripke.h>
-#include <Kripke/Grid.h>
-#include <Kripke/Input_Variables.h>
-
-/**
- * Functional object to run the LTimes kernel.
- */
-struct runLTimes {
-  std::string name(void) const { return "LTimes"; }
-
-  void operator ()(Grid_Data *grid_data) const {
-    grid_data->kernel->LTimes(grid_data);
-  }
-};
-
-/**
- * Functional object to run the LPlusTimes kernel.
- */
-struct runLPlusTimes {
-  std::string name(void) const { return "LPlusTimes"; }
-
-  void operator ()(Grid_Data *grid_data) const {
-    grid_data->kernel->LPlusTimes(grid_data);
-  }
-};
-
-
-/**
- * Functional object to run the scattering kernel.
- */
-struct runScattering {
-  std::string name(void) const { return "scattering"; }
-
-  void operator ()(Grid_Data *grid_data) const {
-    grid_data->kernel->scattering(grid_data);
-  }
-};
-
-
-/**
- * Functional object to run the source kernel.
- */
-struct runSource {
-  std::string name(void) const { return "source"; }
-
-  void operator ()(Grid_Data *grid_data) const {
-    grid_data->kernel->source(grid_data);
-  }
-};
-
-/**
- * Functional object to run the MPI sweep and sweep kernels
- */
-struct runSweep {
-  std::string name(void) const { return "Sweep"; }
-
-  void operator ()(Grid_Data *grid_data) const {
-    std::vector<int> sdom_list(grid_data->subdomains.size());
-    for(int i = 0;i < grid_data->subdomains.size();++ i){
-      sdom_list[i] = i;
-    }
-    SweepSubdomains(sdom_list, grid_data, false);
-  }
-};
-
-
-/**
- * Tests a specific kernel (using one of the above runXXX functional objects).
- */
-template<typename KernelRunner>
-void testKernel(Input_Variables &input_variables){
-  int myid=0;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
-#endif
-
-  KernelRunner kr;
-
-  if(myid == 0){
-    printf("  Comparing %s to %s for kernel %s\n",
-      nestingString(NEST_GDZ).c_str(),
-      nestingString(input_variables.nesting).c_str(),
-      kr.name().c_str());
-  }
-
-  // Allocate two problems (one reference)
-  if(myid == 0)printf("    -- allocating\n");
-  Grid_Data *grid_data = new Grid_Data(&input_variables);
-
-  Nesting_Order old_nest = input_variables.nesting;
-  input_variables.nesting = NEST_GDZ;
-  Grid_Data *ref_data = new Grid_Data(&input_variables);
-  input_variables.nesting = old_nest;
-
-  // Generate random data in the reference problem, and copy it to the other
-  if(myid == 0)printf("    -- randomizing data\n");
-  ref_data->randomizeData();
-  grid_data->copy(*ref_data);
-
-  if(myid == 0)printf("    -- running kernels\n");
-
-  // Run both kernels
-  kr(ref_data);
-  kr(grid_data);
-
-  if(myid == 0)printf("    -- comparing results\n");
-  // Compare differences
-  bool is_diff = ref_data->compare(*grid_data, 1e-12, true);
-  if(is_diff){
-    if(myid == 0)KripkeAbort("Differences found, bailing out\n");
-  }
-
-  // Cleanup
-  if(myid == 0)printf("    -- OK\n\n");
-  delete grid_data;
-  delete ref_data;
-}
-
-
-/**
- * Tests all kernels given the specified input.
- */
-void testKernels(Input_Variables &input_variables){
-  // Run LTimes
-  testKernel<runLTimes>(input_variables);
-
-  // Run LPlusTimes
-  testKernel<runLPlusTimes>(input_variables);
-
-  // Run Scattering
-  testKernel<runScattering>(input_variables);
-
-  // Run Source
-  testKernel<runSource>(input_variables);
-
-  // Run Sweep
-  testKernel<runSweep>(input_variables);
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Test/TestKernels.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Test/TestKernels.h
deleted file mode 100644
index 2330e657d..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Test/TestKernels.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_TOOLS_TEST_KERNELS_H__
-#define KRIPKE_TOOLS_TEST_KERNELS_H__
-
-struct Input_Variables;
-
-void testKernels(Input_Variables &input_variables);
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Timing.cpp b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Timing.cpp
deleted file mode 100644
index 2bb01dcc3..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Timing.cpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#include <Kripke/Timing.h>
-#ifdef RAJA_USE_CALIPER
-#include <caliper/Annotation.h>
-#endif
-
-#include<Kripke.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <algorithm>
-#include <vector>
-#include <sstream>
-
-#ifdef KRIPKE_USE_MPI
-#include <mpi.h>
-#endif
-
-#ifdef KRIPKE_USE_BGPM
-extern "C" void HPM_Start(char const *);
-extern "C" void HPM_Stop(char const *);
-#endif
-
-
-#ifdef KRIPKE_USE_PAPI
-#include <papi.h>
-#endif
-
-
-Timing::~Timing(){
-#ifdef KRIPKE_USE_PAPI
-long long tmp[16];
-PAPI_stop(papi_set, tmp);
-#endif
-
-}
-
-void Timing::start(std::string const &name){
-  // get or create timer
-  Timer &timer = timers[name];
-
-  if(!timer.started){
-    timer.started = true;
-    timer.start_time = getTime();
-
-#ifdef KRIPKE_USE_PAPI
-    int num_papi = papi_event.size();
-    if(num_papi > 0){
-      if(timer.papi_total.size() == 0){
-        timer.papi_start_values.resize(num_papi, 0);
-        timer.papi_total.resize(num_papi, 0);
-      }
-
-      /*
-      // start timers
-      PAPI_start_counters(&papi_event[0], num_papi);
-
-      // clear timers
-      long long tmp[16];
-      PAPI_read_counters(tmp, num_papi);
-      */
-
-      // read initial values
-      PAPI_read(papi_set, &timer.papi_start_values[0]);
-
-    }
-#endif
-
-#ifdef RAJA_USE_CALIPER
-    cali::Annotation(name.c_str()).begin();
-#endif
-#ifdef KRIPKE_USE_BGPM
-    HPM_Start(name.c_str());
-#endif
-  }
-}
-
-void Timing::stop(std::string const &name){
-  // get or create timer
-  Timer &timer = timers[name];
-
-#ifdef KRIPKE_USE_BGPM
-    HPM_Stop(name.c_str());
-#endif
-
-#ifdef RAJA_USE_CALIPER
-    cali::Annotation(name.c_str()).end();
-#endif
-  if(timer.started){
-#ifdef KRIPKE_USE_PAPI
-    int num_papi = papi_event.size();
-    if(num_papi > 0){
-      // read timers
-      long long tmp[16];
-      //PAPI_stop_counters(tmp, num_papi);
-      PAPI_read(papi_set, tmp);
-
-      // accumulate to all started timers (since this clears the PAPI values)
-      for(int i = 0;i < num_papi;++ i){
-        timer.papi_total[i] += tmp[i] - timer.papi_start_values[i];
-      }
-
-    }
-#endif
-
-    // Stop the timer
-    timer.started = false;
-    timer.total_time += getTime() - timer.start_time;
-    timer.count ++;
-
-  }
-}
-
-void Timing::stopAll(void){
-  for(TimerMap::iterator i = timers.begin();i != timers.end();++ i){
-    stop((*i).first);
-  }
-}
-
-void Timing::clear(void){
-  timers.clear();
-}
-
-void Timing::print(void) const {
-  int rank=0;
-#ifdef KRIPKE_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-#endif
-  if(rank != 0){
-    return;
-  }
-
-  // build a sorted list of names
-  std::vector<std::string> names;
-  for(TimerMap::const_iterator i = timers.begin();i != timers.end();++ i){
-    names.push_back((*i).first);
-
-  }
-  std::sort(names.begin(), names.end());
-
-  std::vector<Timer const *> ord_timers;
-  for(int i = 0;i < names.size();++ i){
-    std::string &name = names[i];
-    TimerMap::const_iterator iter = timers.find(name);
-    ord_timers.push_back(&(*iter).second);
-  }
-
-  // Display column names
-  printf("Timers:\n");
-  printf("  %-16s  %12s  %12s", "Timer", "Count", "Seconds");
-#ifdef KRIPKE_USE_PAPI
-  int num_papi = papi_names.size();
-  for(int i = 0;i < num_papi;++i){
-    printf("  %16s", papi_names[i].c_str());
-  }
-#endif
-  printf("\n");
-
-  // Dislpay timer results
-  for(int i = 0;i < names.size();++ i){
-    printf("  %-16s  %12d  %12.5lf", names[i].c_str(), (int)ord_timers[i]->count, ord_timers[i]->total_time);
-#ifdef KRIPKE_USE_PAPI
-    for(int p = 0;p < num_papi;++ p){
-      printf("  %16ld", (long)ord_timers[i]->papi_total[p]);
-    }
-#endif
-    printf("\n");
-  }
-  
-  // Now display timers in machine readable format
-  printf("\n");
-  printf("TIMER_NAMES:");
-  for(int i = 0;i < names.size();++ i){
-    if(i > 0){
-      printf(",");
-    }
-    printf("%s", names[i].c_str());
-  }
-  printf("\n");
-  printf("TIMER_DATA:");
-  for(int i = 0;i < names.size();++ i){
-    if(i > 0){
-      printf(",");
-    }
-    printf("%lf", ord_timers[i]->total_time);    
-  }
-  printf("\n");
-}
-
-
-double Timing::getTotal(std::string const &name) const{
-  TimerMap::const_iterator i = timers.find(name);
-  if(i == timers.end()){
-    return 0.0;
-  }
-  return (*i).second.total_time;
-}
-
-
-
-void Timing::setPapiEvents(std::vector<std::string> names){
-#ifdef KRIPKE_USE_PAPI
-
-
-  static bool papi_initialized = false;
-  if(!papi_initialized){
-    //printf("PAPI INIT\n");
-    int retval = PAPI_library_init(PAPI_VER_CURRENT);
-    papi_initialized = true;
-
-    if(retval != PAPI_VER_CURRENT){
-      fprintf(stderr, "ERROR INITIALIZING PAPI\n");
-      exit(1);
-    }
-  }
-
-  //printf("PAPI VERSION=%x\n",
-  //    PAPI_VERSION);
-
-  papi_set = PAPI_NULL;
-  PAPI_create_eventset(&papi_set);
-
-
-  for(int i = 0;i < names.size();++ i){
-    // Convert text string to PAPI id
-    int event_code;
-    PAPI_event_name_to_code(
-        const_cast<char*>(names[i].c_str()),
-        &event_code);
-
-    // TODO: error checking?
-
-    // Add to our list of PAPI events
-    papi_names.push_back(names[i]);
-    papi_event.push_back(event_code);
-
-    int retval = PAPI_add_event(papi_set, event_code);
-    if(retval != PAPI_OK){
-      fprintf(stderr, "ERROR ADDING %s, retval=%d, ID=0x%-10x\n", names[i].c_str(), retval, event_code);
-    }
-
-    //printf("EVT=%s, ID=0x%-10x\n", names[i].c_str(), event_code);
-  }
-  PAPI_start(papi_set);
-#else
-  if(names.size() > 0){
-    fprintf(stderr, "WARNING: PAPI NOT ENABLED, IGNORING PAPI EVENTS\n");
-  }
-#endif
-}
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Timing.h b/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Timing.h
deleted file mode 100644
index 101f138bc..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/Kripke/Timing.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * NOTICE
- *
- * This work was produced at the Lawrence Livermore National Laboratory (LLNL)
- * under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S.
- * Department of Energy (DOE) and Lawrence Livermore National Security, LLC
- * (LLNS) for the operation of LLNL. The rights of the Federal Government are
- * reserved under Contract 44.
- *
- * DISCLAIMER
- *
- * This work was prepared as an account of work sponsored by an agency of the
- * United States Government. Neither the United States Government nor Lawrence
- * Livermore National Security, LLC nor any of their employees, makes any
- * warranty, express or implied, or assumes any liability or responsibility
- * for the accuracy, completeness, or usefulness of any information, apparatus,
- * product, or process disclosed, or represents that its use would not infringe
- * privately-owned rights. Reference herein to any specific commercial products,
- * process, or service by trade name, trademark, manufacturer or otherwise does
- * not necessarily constitute or imply its endorsement, recommendation, or
- * favoring by the United States Government or Lawrence Livermore National
- * Security, LLC. The views and opinions of authors expressed herein do not
- * necessarily state or reflect those of the United States Government or
- * Lawrence Livermore National Security, LLC, and shall not be used for
- * advertising or product endorsement purposes.
- *
- * NOTIFICATION OF COMMERCIAL USE
- *
- * Commercialization of this product is prohibited without notifying the
- * Department of Energy (DOE) or Lawrence Livermore National Security.
- */
-
-#ifndef KRIPKE_TIMING_H__
-#define KRIPKE_TIMING_H__
-
-#include <string>
-#include <vector>
-#include <map>
-#include <stdio.h>
-#include <time.h>
-#include <sys/time.h>
-
-#ifdef KRIPKE_USE_PAPI
-#include<papi.h>
-#endif
-
-inline double getTime(void){
-  struct timeval tv;
-  gettimeofday(&tv, NULL);
-  return (double)tv.tv_sec + (double)tv.tv_usec/1000000.0;
-}
-
-
-struct Timer {
-  Timer() :
-    started(false),
-    start_time(0.0),
-    total_time(0.0),
-    count(0)
-  {}
-
-  bool started;
-  double start_time;
-  double total_time;
-  size_t count;
-#ifdef KRIPKE_USE_PAPI
-  std::vector<long long> papi_start_values;
-  std::vector<size_t> papi_total;
-#endif
-};
-
-class Timing {
-  public:
-    ~Timing();
-
-    void start(std::string const &name);
-    void stop(std::string const &name);
-
-    void stopAll(void);
-    void clear(void);
-
-    void print(void) const;
-    double getTotal(std::string const &name) const;
-
-    void setPapiEvents(std::vector<std::string> names);
-
-  private:
-    typedef std::map<std::string, Timer> TimerMap;
-    TimerMap timers;
-#ifdef KRIPKE_USE_PAPI
-  std::vector<std::string> papi_names;
-  std::vector<int> papi_event;
-  int papi_set;
-#endif
-};
-
-
-#include<stdio.h>
-
-// Aides timing a block of code, with automatic timer stopping
-class BlockTimer {
-  public:
-  inline BlockTimer(Timing &timer_obj, std::string const &timer_name) :
-      timer(timer_obj),
-      name(timer_name)
-  {
-      timer.start(name);
-  }
-  inline ~BlockTimer(){
-    timer.stop(name);
-  }
-
-  private:
-      Timing &timer;
-      std::string name;
-};
-
-#define BLOCK_TIMER(TIMER, NAME) BlockTimer BLK_TIMER_##NAME(TIMER, #NAME);
-
-
-#endif
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/NOTICE.md b/test/Kripke-v1.1/Kripke-v1.1-baseline/NOTICE.md
deleted file mode 100644
index f2dff6f71..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/NOTICE.md
+++ /dev/null
@@ -1,40 +0,0 @@
-LLNL-CODE-658597
-Title: Kripke, Version: 1.1
-Author(s) Adam J. Kunen, etc. all......
-
-
-NOTICE
-======
-
-This work was produced at the Lawrence Livermore National Laboratory (LLNL) 
-under contract no. DE-AC-52-07NA27344 (Contract 44) between the U.S. Department
-of Energy (DOE) and Lawrence Livermore National Security, LLC (LLNS) for the 
-operation of LLNL. The rights of the Federal Government are reserved under 
-Contract 44.
-
-
-DISCLAIMER
-==========
-
-This work was prepared as an account of work sponsored by an agency of the 
-United States Government. Neither the United States Government nor Lawrence 
-Livermore National Security, LLC nor any of their employees, makes any 
-warranty, express or implied, or assumes any liability or responsibility for 
-the accuracy, completeness, or usefulness of any information, apparatus, 
-product, or process disclosed, or represents that its use would not infringe 
-privately-owned rights. Reference herein to any specific commercial products, 
-process, or service by trade name, trademark, manufacturer or otherwise does 
-not necessarily constitute or imply its endorsement, recommendation, or 
-favoring by the United States Government or Lawrence Livermore National 
-Security, LLC. The views and opinions of authors expressed herein do not 
-necessarily state or reflect those of the United States Government or Lawrence 
-Livermore National Security, LLC, and shall not be used for advertising or 
-product endorsement purposes.
-
-
-NOTIFICATION OF COMMERCIAL USE
-==============================
-
-Commercialization of this product is prohibited without notifying the 
-Department of Energy (DOE) or Lawrence Livermore National Security.
-
diff --git a/test/Kripke-v1.1/Kripke-v1.1-baseline/README.md b/test/Kripke-v1.1/Kripke-v1.1-baseline/README.md
deleted file mode 100644
index d72df93e3..000000000
--- a/test/Kripke-v1.1/Kripke-v1.1-baseline/README.md
+++ /dev/null
@@ -1,353 +0,0 @@
-KRIPKE
-======
-
-Version 1.1
-
-Release Date 9/13/2015 
-
-
-Authors
-=======
-  * Adam J. Kunen [kunen1@llnl.gov](mailto:kunen1@llnl.gov) (Primary point of contact)
-  * Peter N. Brown [brown42@llnl.gov](mailto:brown42@llnl.gov)
-  * Teresa S. Bailey [bailey42@llnl.gov](mailto:bailey42@llnl.gov)
-  * Peter G. Maginot [maginot1@llnl.gov](mailto:maginot1@llnl.gov)
-
-
-License
-=======
-See included file NOTICE.md
-
-
-Overview
-========
-Kripke is a simple, scalable, 3D Sn deterministic particle transport code.  Its
-primary purpose is to research how data layout, programming paradigms and 
-architectures effect the implementation and performance of Sn transport.  A 
-main goal of Kripke is investigating how different data-layouts affect 
-instruction, thread and task level parallelism, and what the implications are 
-on overall solver performance.
-
-Kripkie supports storage of angular fluxes (Psi) using all six striding orders 
-(or "nestings") of Directions (D), Groups (G), and Zones (Z), and provides 
-computational kernels specifically written for each of these nestings. Most Sn 
-transport codes are designed around one of these nestings, which is an 
-inflexibility that leads to software engineering compromises when porting to 
-new architectures and programming paradigms.
-
-Early research has found that the problem dimensions (zones, groups, 
-directions, scattering order) and the scaling (number of threads and MPI 
-tasks), can make a profound difference in the performance of each of these 
-nestings. To our knowledge this is a capability unique to Kripke, and should 
-provide key insight into how data-layout effects Sn solver performance. An 
-asynchronous MPI-based parallel sweep algorithm is provided, which employs the 
-concepts of Group Sets (GS) Zone Sets (ZS), and Direction Sets (DS), borrowed 
-from the [Texas A&M code PDT](https://parasol.tamu.edu/asci/).
-
-As we explore new architectures and programming paradigms with Kripke, we will 
-be able to incorporate these findings and ideas into our larger codes. The main
-advantages of using Kripke for this exploration is that it's light-weight (ie. 
-easily refactored and modified), and it gets us closer to the real question we 
-want answered: "What is the best way to layout and implement an Sn code on a 
-given architecture+programming-model?" instead of the more commonly asked 
-question "What is the best way to map my existing Sn code to a given 
-architecture+programming-model?".
-
-
-Mini App or Proxy App?
-----------------------
-Kripke is a Mini-App since it has a very small code base consisting of 4184 
-lines of C++ code (generated using David A. Wheeler's SLOCCount v2.26).
-
-Kripke is also a Proxy-App since it is a proxy for the LLNL transport code 
-ARDRA.
-
-
-Analysis
---------
-A major challenge of achieving high-performance in an Sn transport (or any 
-physics) code is choosing a data-layout and a parallel decomposition that lends
-itself to the targeted architecture. Often the data-layout determines the most 
-efficient nesting of loops in computational kernels, which then determines how 
-well your inner-most-loop SIMDizes, how you add threading (pthreads, OpenMP, 
-etc.), and the efficiency and design of your parallel algorithms. Therefore, 
-each nesting produces different loop nesting orders, which provides 
-substantially different performance characteristics. We want to explore how 
-easily and efficiently these different nestings map to different architectures.
-In particular, we are interested in how we can achieve good parallel efficiency
-while also achieving efficient use of node resources (such as SIMD units, 
-memory systems, and accelerators).
-
-Parallel sweep algorithms can be explored with Kripke in multiple ways. The 
-core MPI algorithm could be modified or rewritten to explore other approaches, 
-domain overloading, or alternate programming models (such as Charm++). The 
-effect of load-imbalance is an understudied aspect of Sn transport sweeps, and 
-could easily be studied with Kripke by artificially adding more work (ie 
-unknowns) to a subset of MPI tasks. Block-AMR could be added to Kripke, which 
-would be a useful way to explore the cost-benefit analysis of adding AMR to an 
-Sn code, and would be a way to further study load imbalances and AMR effects 
-on sweeps.
-
-The coupling of on-node sweep kernel, the parallel sweep algorithm, and the 
-choices of decomposing the problem phase space into GS's, ZS's and DS's impact 
-the performance of the overall sweep. The tradeoff between large and small 
-"units of work" can be studied. Larger "units of work" provide more opportunity
-for on-node parallelism, while creating larger messages, less "sends", and less
-efficient parallel sweeps. Smaller "units of work" make for less efficient 
-on-node kernels, but more efficient parallel sweeps. 
-
-We can also study trading MPI tasks for threads, and the effects this has on 
-our programming models and cache efficiency.
-
-A simple timer infrastructure is provided that measure each compute kernels 
-total time.
-
-
-Physical Models
----------------
-
-Kripke solves the Discrete Ordinance and Diamond Difference discretized 
-steady-state linear Boltzmann equation. 
-
-        H * Psi = (LPlus * S * L) * Psi + Q
-
-Where:
-
-*   **Psi** is the unknown angular flux discretized over zones, directions, 
-    and energy groups
-
-*   **H** is the "streaming-collision" operator.  (Couples zones)
-
-*   **L** is the "discrete-to-moments operator. (Couples directions and moments)
-
-*   **LPlus** is the "moment-to-discrete" operator. 
-    (Couples directions and moments)
-
-*   **S** is the (arbitrary) order scattering operator. (Couples groups)
-
-*   **Q** is an external source. In Kripke it is represented in moment space, 
-    so really "LPlus*Q"
-
-
-Kripke is hard-coded to setup and solve the [3D Kobayashi radiation benchmark, 
-problem 3i](https://www.oecd-nea.org/science/docs/2000/nsc-doc2000-4.pdf).  
-Since Kripke does not have reflecting boundary conditions, the full-space model
-is solved. Command line arguments allow the user to modify the total and 
-scattering cross-sections.  Since Kripke is a multi-group transport code and 
-the Kobayashi problem is single-group, each energy group is setup to solve the 
-same problem with no group-to-group coupling in the data.
-
-
-The steady-state solution method uses the source-iteration technique, where 
-each iteration is as follows:
-
-1.  Phi = LTimes(Psi)
-2.  PhiOut = Scattering(Phi)
-3.  PhiOut = PhiOut + Source()
-4.  Rhs = LPlusTimes(PhiOut)
-5.  Psi = Sweep(Rhs, Psi)  which is solving Psi=(Hinverse * Rhs) a.k.a 
-    _"Inverting H"_
-
-
-
-Building and Running
-====================
-
-Kripke comes with a simple CMake based build system.
-
-Requirements
-------------
-*  CMake 3.0 or later
-*  C++ Compiler (g++, icpc, etc.)
-*  MPI 1.0 or later
-
-
-
-Quick Start
------------
-The easiest way to get Kripke running, is to directly invoke CMake and take 
-whatever system defaults you have for compilers and let CMake find MPI for you.
-
-*  Step 1:  Create a build space (assuming you are starting in the Kripke root 
-   directory)   
-        
-        mkdir build
-
-*  Step 2: Run CMake in that build space
-        
-        cd kripke
-        cmake ..
-
-*  Step 3: Now make Kripke:
-         
-        make -j8
-  
-*  Step 4: Run the test suite to make sure it works
-   
-        make test
-  
-*  Step 5: Run Kripke's default problem:
-   
-        ./kripke
-  
-
-Running Kripke
-==============
-
-Environment Variabes
---------------------
-
-If Kripke is build with OpenMP support, then the environment variables 
-``OMP_NUM_THREADS`` is used to control the number of OpenMP threads.  Kripke 
-does not attempt to modify the OpenMP runtime in anyway, so other ``OMP_*`` 
-environment variables should also work as well.
- 
-
-Command Line Options
---------------------
-Command line option help can also be viewed by running "./kripke --help"
-
-### Problem Size Options:
-
-*   **``--groups <ngroups>``**     
-
-    Number of energy groups. (Default: --groups 32)
-
-*   **``--legendre <lorder>``**    
-
-    Scattering Legendre Expansion Order (0, 1, ...).  (Default: --legendre 4)
-
-*   **``--quad <ndirs>``**, or **``--quad <polar>:<azim>``**
-
-    Define the quadrature set to use either a fake S2 with <ndirs> points, OR 
-		Gauss-Legendre with <polar> by <azim> points.   (Default: --quad 96)
-
-*   **``--zones <x>,<y>,<z>``**
-
-    Number of zones in x,y,z.  (Default: --zones 16,16,16)
-
-
-### Physics Parameters:
-
-*   **``--sigt <sigt0,sigt1,sigt2>``**
- 
-    Total material cross-sections.  (Default:   --sigt 0.1,0.0001,0.1)
-
-*   **``--sigs <sigs0,sigs1,sigs2>``**
- 
-    Total material cross-sections.  (Default:   --sigs 0.05,0.00005,0.05)
-
-
-### On-Node Options:
-
-*   **``--nest <NEST>``**
-
-    Loop nesting order (and data layout), available are DGZ, DZG, GDZ, GZD, 
-		ZDG, and ZGD. (Default: --nest DGZ)
-
-
-###Parallel Decomposition Options:
-
-*   **``--layout <lout>``**        
-    
-    Layout of spatial subdomains over mpi ranks. 0 for "Blocked" where local 
-		zone sets represent adjacent regions of space. 1 for "Scattered" where 
-		adjacent regions of space are distributed to adjacent MPI ranks. 
-		(Default: --layout 0)
-
-*   **--procs <npx,npy,npz>**  
-    
-    Number of MPI ranks in each spatial dimension. (Default:  --procs 1,1,1)
-
-*   **``--dset <ds>``**
-
-    Number of direction-sets.  Must be a factor of 8, and divide evenly the 
-		number of quadrature points. (Default:  --dset 8)
-
-*   **``--gset <gs>``**            
-    
-    Number of energy group-sets.  Must divide evenly the number energy groups. 
-		(Default:  --gset 1)
-
-*   **``--zset <zx>,<zy>,<zz>``**  
-    
-    Number of zone-sets in x, y, and z.  (Default:  --zset 1:1:1)
-
-
-###Solver Options:
-
-*   **``--niter <NITER>``**
-
-    Number of solver iterations to run. (Default:  --niter 10)
-
-*   **``--pmethod <method>``**     
-
-    Parallel solver method. "sweep" for full up-wind sweep (wavefront 
-		algorithm). "bj" for Block Jacobi.  (Default: --pmethod sweep)
-
-
-### Output and Testing Options:
-
-*   **``--test``**                 
-
-    Run Kernel Test instead of solve
-
-*   **``--silo <siloname>``**                 
-
-    Write SILO output (requires building with LLNL's Silo library)
-
-*   **``--papi <PAPI_XXX_XXX,...>``**
-
-    Track PAPI hardware counters for each timer. (requires building with 
-		PAPI library)
-    
-
-Test Suite
-----------
-
-Running with the ``--test`` command line argument will run a unit-testing frame
-work that will compare each kernel, using random input data, with the same 
-kernel from a different nesting.  This is very useful for checking correctness 
-of kernels after modification.
-
-Running ``make test`` will use the CMake testing framework, CTest, to run a 
-series of tests outlined in the root ``CMakeLists.txt`` file.
-
-
-Future Plans
-============
-
-Some ideas for future study:
-
-*   Block AMR.
-
-*   More FLOP intensive spatial discretizations such as DFEM's.
-
-*   Programming model abstractions
-
-
-Retirement
-==========
-
-Retirement of this Mini-App should be considered when it is no longer a 
-representative of state-of-the-art transport codes, or when it becomes too 
-cumbersome to adapt to advanced architectures. Also, at the point of 
-retirement it should be clear how to design its successor.
-
-
-Publications, Presentations, Links
-==================================
-
-*  [LLNL Codesign Website](https://codesign.llnl.gov/index.php)
-
-*  A. J. Kunen, T. S. Bailey, P. N. Brown, [KRIPKE- A Massively Parallel Transport Mini-App](https://codesign.llnl.gov/pdfs/Kripke_ANS_2015_Paper.pdf) American Nuclear Society M&C 2015,  April 21, 2015 (LLNL-CONF-675389)
-
-*  A. J. Kunen, [RAJA-Like Transformations in Kripke](https://codesign.llnl.gov/pdfs/TLoops.pdf), February 5, 2015 (LLNL-PRES-666686)
-
-*  A. J. Kunen,  [An Sn Transport Mini App](https://codesign.llnl.gov/pdfs/Kripke_Present.pdf), October 22, 2014 (LLNL-PRES-661866)
-
-
-
-Release
-=======
-LLNL-CODE-658597
diff --git a/test/LULESH-v1.0/CMakeLists.txt b/test/LULESH-v1.0/CMakeLists.txt
deleted file mode 100644
index 62a7b45bc..000000000
--- a/test/LULESH-v1.0/CMakeLists.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-add_subdirectory(LULESH-v1.0_baseline)
-add_subdirectory(LULESH-v1.0_RAJA-variants)
diff --git a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/CMakeLists.txt b/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/CMakeLists.txt
deleted file mode 100644
index ee0a0442d..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-
-if (RAJA_ENABLE_CUDA)
-  add_definitions(-DUSE_CASE=9)
-  cuda_add_executable(lulesh-RAJA-parallel.exe
-    luleshRAJA-parallel.cxx)
-else()
-  if(RAJA_ENABLE_OPENMP)
-    add_executable(lulesh-RAJA-parallel.exe
-      luleshRAJA-parallel.cxx)
-  else()
-    add_definitions(-DUSE_CASE=1)
-    add_executable(lulesh-RAJA-parallel.exe
-      luleshRAJA-parallel.cxx)
-  endif()
-
-add_executable(lulesh-RAJA-serial.exe
-  luleshRAJA-serial.cxx)
-
-target_link_libraries(lulesh-RAJA-serial.exe
- RAJA
- ${RT_LIBRARIES})
-
-endif()
-
-target_link_libraries(lulesh-RAJA-parallel.exe
-  RAJA
-  ${RT_LIBRARIES})
-
diff --git a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshMemory.hxx b/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshMemory.hxx
deleted file mode 100644
index a4cb54366..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshMemory.hxx
+++ /dev/null
@@ -1,178 +0,0 @@
-// This work was performed under the auspices of the U.S. Department of Energy by
-// Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344.
-//
-
-//
-// ALLOCATE/RELEASE FUNCTIONS 
-//
-
-#if defined(RAJA_ENABLE_CUDA) // CUDA managed memory allocate/release
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-template <typename T>
-inline T *Allocate(size_t size)
-{
-   T *retVal ;
-   cudaErrchk( cudaMallocManaged((void **)&retVal, sizeof(T)*size, cudaMemAttachGlobal) ) ;
-   return retVal ;
-}
-
-template <typename EXEC_POLICY_T, typename T>
-inline T *AllocateTouch(RAJA::IndexSet *is, size_t size)
-{
-   T *retVal ;
-   cudaErrchk( cudaMallocManaged((void **)&retVal, sizeof(T)*size, cudaMemAttachGlobal) ) ;
-   cudaMemset(retVal,0,sizeof(T)*size);
-   return retVal ;
-}
-
-template <typename T>
-inline void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      cudaErrchk( cudaFree(*ptr) ) ;
-      *ptr = NULL ;
-   }
-}
-
-
-#else  // Standard CPU memory allocate/release
-
-#include <cstdlib>
-#include <cstring>
-
-template <typename T>
-inline T *Allocate(size_t size)
-{
-   T *retVal ;
-   posix_memalign((void **)&retVal, RAJA::DATA_ALIGN, sizeof(T)*size);
-// memset(retVal,0,sizeof(T)*size);
-   return retVal ;
-}
-
-template <typename EXEC_POLICY_T, typename T>
-inline T *AllocateTouch(RAJA::IndexSet *is, size_t size)
-{
-   T *retVal ;
-   posix_memalign((void **)&retVal, RAJA::DATA_ALIGN, sizeof(T)*size);
-
-   /* we should specialize by policy type here */
-   RAJA::forall<EXEC_POLICY_T>( *is, [=] RAJA_DEVICE (int i) {
-      retVal[i] = 0 ;
-   } ) ;
-
-   return retVal ;
-}
-
-template <typename T>
-inline void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-template <typename T>
-inline void Release(T * __restrict__ *ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-#endif 
-
-
-/**********************************/
-/* Memory Pool                    */
-/**********************************/
-
-namespace RAJA {
-
-template <typename VARTYPE >
-struct MemoryPool {
-public:
-   MemoryPool()
-   {
-      for (int i=0; i<32; ++i) {
-         lenType[i] = 0 ;
-         ptr[i] = 0 ;
-      }
-   }
-
-   VARTYPE *allocate(int len) {
-      VARTYPE *retVal ;
-      int i ;
-      for (i=0; i<32; ++i) {
-         if (lenType[i] == len) {
-            lenType[i] = -lenType[i] ;
-            retVal = ptr[i] ;
-#if 0
-            /* migrate smallest lengths to be first in list */
-            /* since longer lengths can amortize lookup cost */
-            if (i > 0) {
-               if (len < abs(lenType[i-1])) {
-                  lenType[i] = lenType[i-1] ;
-                  ptr[i] = ptr[i-1] ;
-                  lenType[i-1] = -len ;
-                  ptr[i-1] = retVal ;
-               }
-            }
-#endif
-            break ;
-         }
-         else if (lenType[i] == 0) {
-            lenType[i] = -len ;
-            ptr[i] = Allocate<VARTYPE>(len) ;
-            retVal = ptr[i] ;
-            break ;
-         }
-      }
-      if (i == 32) {
-         retVal = 0 ;  /* past max available pointers */
-      }
-      return retVal ;
-   }
-
-   bool release(VARTYPE **oldPtr) {
-      int i ;
-      bool success = true ;
-      for (i=0; i<32; ++i) {
-         if (ptr[i] == *oldPtr) {
-            lenType[i] = -lenType[i] ;
-            *oldPtr = 0 ;
-            break ;
-         }
-      }
-      if (i == 32) {
-         success = false ; /* error -- not found */
-      }
-      return success ;
-   }
-
-   bool release(VARTYPE * __restrict__ *oldPtr) {
-      int i ;
-      bool success = true ;
-      for (i=0; i<32; ++i) {
-         if (ptr[i] == *oldPtr) {
-            lenType[i] = -lenType[i] ;
-            *oldPtr = 0 ;
-            break ;
-         }
-      }
-      if (i == 32) {
-         success = false ; /* error -- not found */
-      }
-      return success ; 
-   }
-
-   VARTYPE *ptr[32] ; 
-   int lenType[32] ;
-} ;
-
-}
-
diff --git a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshPolicy.hxx b/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshPolicy.hxx
deleted file mode 100644
index 0628d8e5a..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshPolicy.hxx
+++ /dev/null
@@ -1,252 +0,0 @@
-// This work was performed under the auspices of the U.S. Department of Energy by
-// Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344.
-//
-
-//
-//   Tiling modes for different exeuction cases (see luleshPolicy.hxx).
-//
-enum TilingMode
-{
-   Canonical,       // canonical element ordering -- single range segment
-   Tiled_Index,     // canonical ordering, tiled using unstructured segments
-   Tiled_Order,     // elements permuted, tiled using range segments
-   Tiled_LockFree,  // tiled ordering, lock-free
-   Tiled_LockFreeColor,     // tiled ordering, lock-free, unstructured
-   Tiled_LockFreeColorSIMD  // tiled ordering, lock-free, range
-};
-
-
-// Use cases for RAJA execution patterns:
-
-#define LULESH_SEQUENTIAL       1 /* (possible SIMD vectorization applied) */
-#define LULESH_CANONICAL        2 /*  OMP forall applied to each for loop */
-#define LULESH_TILE_INDEXED     3 /*  OMP Tiles defined by unstructured */
-                                  //  Indexset Segment partitioning.
-                                  //  One tile per segment.
-#define LULESH_TILE_ORDERED     4 /*  OMP The mesh is permuted so a tile */
-                                  //  is defined as a contiguous chunk
-                                  //  of the iteration space. Tile per thread.
-#define LULESH_TILE_TASK        5 /*  OMP Mesh chunked like Canonical, but */
-                                  //  now chunks are dependency scheduled,
-                                  //  reducing the need for lock constructs
-#define LULESH_TILE_COLOR       6 /*  OMP Analogous to Tile_Indexed, but */
-                                  //  individual array values are 
-                                  //  'checker-boarded' into 'colors' to
-                                  //  guarantee indpenedent data access as
-                                  //  long as each 'color' of array values
-                                  //  completes before executing the next color
-#define LULESH_TILE_COLOR_SIMD  7 /*  Colored like USE_CASE 6, but colors */
-                                  //  are permuted to be contiguous chunks,
-                                  //  like LULESH_TILED_ORDERED
-#define LULESH_CILK             8 /*  cilk_for applied to each loop */
-#define LULESH_CUDA_CANONICAL   9 /*  CUDA launch applied to each loop */
-#define LULESH_CUDA_COLOR_SIMD 10 /*  Technique 7 on GPU to avoid */
-                                  //  OMP_FINE_SYNC data movement.
-
-#ifndef USE_CASE
-#define USE_CASE   LULESH_TILE_TASK
-#endif
-
-// ----------------------------------------------------
-#if USE_CASE == LULESH_SEQUENTIAL 
-
-TilingMode const lulesh_tiling_mode = Canonical;
-
-typedef RAJA::seq_segit              Hybrid_Seg_Iter;
-typedef RAJA::simd_exec              Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> symnode_exec_policy;
-
-typedef RAJA::seq_reduce reduce_policy; 
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_CANONICAL
-
-// Requires OMP_FINE_SYNC when run in parallel
-#define OMP_FINE_SYNC 1
-
-// AllocateTouch should definitely be used
-
-TilingMode const lulesh_tiling_mode = Canonical;
-
-typedef RAJA::seq_segit              Hybrid_Seg_Iter;
-typedef RAJA::omp_parallel_for_exec  Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> symnode_exec_policy;
-
-typedef RAJA::omp_reduce reduce_policy;
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_TILE_INDEXED
-
-// Currently requires OMP_FINE_SYNC when run in parallel
-#define OMP_FINE_SYNC 1
-
-// Only use AllocateTouch if tiling is imposed on top of a block decomposition,
-// and that block decomposition is the indexset used for the first touch (see CreateMaskedIndexSet)
-
-TilingMode const lulesh_tiling_mode = Tiled_Index;
-
-typedef RAJA::omp_parallel_for_segit  Hybrid_Seg_Iter;
-typedef RAJA::simd_exec               Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec>  node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<RAJA::omp_parallel_for_segit, RAJA::simd_exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<RAJA::omp_parallel_for_segit, RAJA::simd_exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec>  symnode_exec_policy;
-
-typedef RAJA::omp_reduce reduce_policy; 
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_TILE_ORDERED
-
-// Currently requires OMP_FINE_SYNC when run in parallel
-#define OMP_FINE_SYNC 1
-
-// AllocateTouch should definitely be used
-
-TilingMode const lulesh_tiling_mode = Tiled_Order;
-
-typedef RAJA::omp_parallel_for_segit  Hybrid_Seg_Iter;
-typedef RAJA::simd_exec               Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec>  node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<RAJA::omp_parallel_for_segit, RAJA::simd_exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<RAJA::omp_parallel_for_segit, RAJA::simd_exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec>  symnode_exec_policy;
-
-typedef RAJA::omp_reduce reduce_policy; 
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_TILE_TASK
-
-// Can be used with or without OMP_FINE_SYNC; without will have less data movement and memory use
-
-// AllocateTouch should definitely be used
-
-// In reality, only the "lock-free" operations need to use the dependence graph embedded in the
-// lock-free indexset, and the dependence-graph should likely be deactivated for other operations.
-
-TilingMode const lulesh_tiling_mode = Tiled_LockFree;
-
-typedef RAJA::omp_parallel_for_segit  Hybrid_Seg_Iter;
-typedef RAJA::simd_exec               Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec>  node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<RAJA::omp_taskgraph_segit, RAJA::simd_exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<RAJA::omp_taskgraph_segit, RAJA::simd_exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<RAJA::seq_segit, RAJA::omp_parallel_for_exec>  symnode_exec_policy;
-
-typedef RAJA::omp_reduce reduce_policy; 
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_TILE_COLOR
-
-// Can be used with or without OMP_FINE_SYNC; without will have less data movement and memory use
-
-// AlocateTouch use is very tricky with this lockfree indexset.
-
-TilingMode const lulesh_tiling_mode = Tiled_LockFreeColor;
-
-typedef RAJA::seq_segit              Hybrid_Seg_Iter;
-typedef RAJA::omp_parallel_for_exec  Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, RAJA::omp_parallel_for_exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, RAJA::omp_parallel_for_exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, RAJA::omp_parallel_for_exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, RAJA::omp_parallel_for_exec> symnode_exec_policy;
-
-typedef RAJA::omp_reduce reduce_policy; 
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_TILE_COLOR_SIMD
-
-// Can be used with or without OMP_FINE_SYNC; without will have less data movement and memory use
-
-// AlocateTouch use is very tricky with this lockfree indexset.
-
-TilingMode const lulesh_tiling_mode = Tiled_LockFreeColorSIMD;
-
-typedef RAJA::seq_segit              Hybrid_Seg_Iter;
-typedef RAJA::omp_parallel_for_exec  Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> symnode_exec_policy;
-
-typedef RAJA::omp_reduce reduce_policy; 
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_CILK
-
-// Requires OMP_FINE_SYNC when run in parallel
-#define OMP_FINE_SYNC 1
-
-// AllocateTouch should definitely be used
-
-TilingMode const lulesh_tiling_mode = Canonical;
-
-typedef RAJA::cilk_for_segit         Hybrid_Seg_Iter;
-typedef RAJA::cilk_for_exec          Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> symnode_exec_policy;
-
-typedef RAJA::cilk_reduce            reduce_policy ;
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_CUDA_CANONICAL
-
-// Requires OMP_FINE_SYNC 
-#define OMP_FINE_SYNC 1
-
-TilingMode const lulesh_tiling_mode = Canonical;
-
-typedef RAJA::seq_segit         Hybrid_Seg_Iter;
-
-/// Define thread block size for CUDA exec policy
-const size_t thread_block_size = 256;
-typedef RAJA::cuda_exec<thread_block_size>    Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> symnode_exec_policy;
-
-typedef RAJA::cuda_reduce<thread_block_size> reduce_policy; 
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_CUDA_COLOR_SIMD
-
-// Can be used with or without OMP_FINE_SYNC; without will have less data movement and memory use
-
-TilingMode const lulesh_tiling_mode = Tiled_LockFreeColorSIMD;
-
-typedef RAJA::seq_segit         Hybrid_Seg_Iter;
-
-/// Define thread block size for CUDA exec policy
-const size_t thread_block_size = 256;
-typedef RAJA::cuda_exec<thread_block_size>    Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Hybrid_Seg_Iter, Segment_Exec> symnode_exec_policy;
-
-typedef RAJA::cuda_reduce<thread_block_size> reduce_policy; 
-
-#else
-
-#error "You must define a use case in luleshPolicy.cxx"
-
-#endif
-
diff --git a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-parallel-FT.cxx b/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-parallel-FT.cxx
deleted file mode 100644
index 30304d160..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-parallel-FT.cxx
+++ /dev/null
@@ -1,3383 +0,0 @@
-/*
-
-                 Copyright (c) 2010.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 1.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <cctype>
-
-#include "RAJA/RAJA.hxx"
-
-#include "Timer.hxx"
-
-/*
- ***********************************************
- * Set parameters that define how code will run.
- ***********************************************
- */
-
-//
-// Display simulation time and timestep during run.
-//
-bool show_run_progress = false;
-
-//
-// Set stop time and time increment for run.
-//
-// The absolute value of lulesh_time_step sets the first time step increment.
-//   - If < 0, the CFL condition will be used to determine subsequent time
-//     step sizes (with some upper bound on the amount the timestep can grow).
-//   - If > 0, the time step will be fixed for the entire run.
-//
-const double lulesh_stop_time = 1.0e-2;
-const double lulesh_time_step = -1.0e-7;
-
-//
-// Set mesh size (physical domain size is fixed).
-//
-// Mesh will be lulesh_edge_elems^3.
-//
-const int lulesh_edge_elems = 45;
-
-
-//
-//   Tiling mode.
-//
-enum TilingMode
-{
-   Canonical,       // canonical element ordering -- single range segment
-   Tiled_Index,     // canonical ordering, tiled using unstructured segments
-   Tiled_Order,     // elements permuted, tiled using range segments
-   Tiled_LockFree,  // tiled ordering, lock-free
-};
-TilingMode lulesh_tiling_mode = Canonical;
-//TilingMode lulesh_tiling_mode = Tiled_Index;
-//TilingMode lulesh_tiling_mode = Tiled_Order;
-//TilingMode lulesh_tiling_mode = Tiled_LockFree;
-
-//
-// Set number of tiles in each mesh direction for non-canonical oerderings.
-//
-const int lulesh_xtile = 2;
-const int lulesh_ytile = 2;
-const int lulesh_ztile = 2;
-
-//
-//   RAJA IndexSet type used in loop traversals.
-//
-//   Need to verify if this can be set to RangeSegment or ListSegment
-//   types. It may be useful to compare IndexSet performance to
-//   basic segment types; e.g.,
-//
-//     - Canonical ordering should be able to use IndexSet or
-//                                                RangeSegment.
-//     - Tiled_Index ordering should be able to use IndexSet or
-//                                                  ListSegment.
-//
-//   Policies for index set segment iteration and segment execution.
-//
-//   NOTE: Currently, we apply single policy across all loop patterns.
-//
-typedef RAJA::seq_segit              IndexSet_Seg_Iter;
-//typedef RAJA::omp_parallel_for_segit IndexSet_Seg_Iter;
-//typedef RAJA::cilk_for_segit         IndexSet_Seg_Iter;
-
-//typedef RAJA::seq_exec              Segment_Exec;
-//typedef RAJA::simd_exec             Segment_Exec;
-typedef RAJA::omp_parallel_for_exec Segment_Exec;
-//typedef RAJA::cilk_for_exec         Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_Seg_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_Seg_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_Seg_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_Seg_Iter, Segment_Exec> minloc_exec_policy;
-typedef                                            Segment_Exec  range_exec_policy;
-
-typedef                                            RAJA::omp_reduce  reduce_policy;
-
-//
-// use RAJA data types for loop operations using RAJA
-//
-typedef RAJA::Index_type  Index_t ; /* array subscript and loop index */
-typedef RAJA::Real_type   Real_t ;  /* floating point representation */
-typedef RAJA::Real_ptr    Real_p;
-typedef RAJA::const_Real_ptr    const_Real_p;
-typedef RAJA::Index_type* Index_p;
-
-/****************************************************/
-/*                                                  */
-/* Allow flexibility for arithmetic representations */
-/*                                                  */
-/* Think about how to make this consistent w/RAJA   */
-/* type parameterization (above)!!                  */
-/*                                                  */
-/****************************************************/
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-/* Could also support fixed point and interval arithmetic types */
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  /* 10 bytes on x86 */
-
-typedef int    Int_t ;   /* integer representation */
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-#define RAJA_STORAGE static inline
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-
-#ifdef RAJA_ENABLE_FT
-#include <unistd.h>
-#include <signal.h>
-
-/* fault_type:   == 0 no fault, < 0 unrecoverable, > 0 recoverable */
-namespace RAJA {
-volatile int fault_type = 0 ;
-}
-
-static struct sigaction sigalrmact ;
-
-static void simulate_fault(int sig)
-{
-   /* 10% chance of unrecoverable fault */
-   RAJA::fault_type = (rand() % 100) - 10 ;
-}
-#endif
-
-/*********************************/
-/* Data structure implementation */
-/*********************************/
-
-/* might want to add access methods so that memory can be */
-/* better managed, as in luleshFT */
-
-struct Domain {
-   /* Elem-centered */
-
-   RAJA::IndexSet *domElemList ;   /* elem indexset */
-   RAJA::IndexSet *matElemList ;   /* material indexset */
-   Index_p nodelist ;     /* elemToNode connectivity */
-
-   Index_p lxim ;         /* elem connectivity through face */
-   Index_p lxip ;
-   Index_p letam ;
-   Index_p letap ;
-   Index_p lzetam ;
-   Index_p lzetap ;
-
-   Int_t *elemBC ;         /* elem face symm/free-surface flag */
-
-   Real_p e ;             /* energy */
-
-   Real_p p ;             /* pressure */
-
-   Real_p q ;             /* q */
-   Real_p ql ;            /* linear term for q */
-   Real_p qq ;            /* quadratic term for q */
-
-   Real_p v ;             /* relative volume */
-
-   Real_p volo ;          /* reference volume */
-   Real_p delv ;          /* m_vnew - m_v */
-   Real_p vdov ;          /* volume derivative over volume */
-
-   Real_p arealg ;        /* elem characteristic length */
-
-   Real_p ss ;            /* "sound speed" */
-
-   Real_p elemMass ;      /* mass */
-
-   /* Elem temporaries */
-
-   Real_p vnew ;          /* new relative volume -- temporary */
-
-   Real_p delv_xi ;       /* velocity gradient -- temporary */
-   Real_p delv_eta ;
-   Real_p delv_zeta ;
-
-   Real_p delx_xi ;       /* position gradient -- temporary */
-   Real_p delx_eta ;
-   Real_p delx_zeta ;
-
-   Real_p dxx ;          /* principal strains -- temporary */
-   Real_p dyy ;
-   Real_p dzz ;
-
-   /* Node-centered */
-
-   RAJA::IndexSet *domNodeList ;   /* node indexset */
-
-   Real_p x ;             /* coordinates */
-   Real_p y ;
-   Real_p z ;
-
-   Real_p xd ;            /* velocities */
-   Real_p yd ;
-   Real_p zd ;
-
-   Real_p xdd ;           /* accelerations */
-   Real_p ydd ;
-   Real_p zdd ;
-
-   Real_p fx ;            /* forces */
-   Real_p fy ;
-   Real_p fz ;
-
-   Real_p nodalMass ;     /* mass */
-
-   // OMP hack 
-   Index_p nodeElemStart ;
-   Index_p nodeElemCornerList ;
-
-   /* Boundary nodesets */
-
-   Index_p symmX ;        /* Nodes on X symmetry plane */
-   Index_p symmY ;        /* Nodes on Y symmetry plane */
-   Index_p symmZ ;        /* Nodes on Z symmetry plane */
-
-   /* Parameters */
-
-   Real_t  dtfixed ;           /* fixed time increment */
-   Real_t  time ;              /* current time */
-   Real_t  deltatime ;         /* variable time increment */
-   Real_t  deltatimemultlb ;
-   Real_t  deltatimemultub ;
-   Real_t  stoptime ;          /* end time for simulation */
-
-   Real_t  u_cut ;             /* velocity tolerance */
-   Real_t  hgcoef ;            /* hourglass control */
-   Real_t  qstop ;             /* excessive q indicator */
-   Real_t  monoq_max_slope ;
-   Real_t  monoq_limiter_mult ;
-   Real_t  e_cut ;             /* energy tolerance */
-   Real_t  p_cut ;             /* pressure tolerance */
-   Real_t  ss4o3 ;
-   Real_t  q_cut ;             /* q tolerance */
-   Real_t  v_cut ;             /* relative volume tolerance */
-   Real_t  qlc_monoq ;         /* linear term coef for q */
-   Real_t  qqc_monoq ;         /* quadratic term coef for q */
-   Real_t  qqc ;
-   Real_t  eosvmax ;
-   Real_t  eosvmin ;
-   Real_t  pmin ;              /* pressure floor */
-   Real_t  emin ;              /* energy floor */
-   Real_t  dvovmax ;           /* maximum allowable volume change */
-   Real_t  refdens ;           /* reference density */
-
-   Real_t  dtcourant ;         /* courant constraint */
-   Real_t  dthydro ;           /* volume change constraint */
-   Real_t  dtmax ;             /* maximum allowable time increment */
-
-   Int_t   cycle ;             /* iteration count for simulation */
-
-   Index_t sizeX ;
-   Index_t sizeY ;
-   Index_t sizeZ ;
-   Index_t numElem ;
-
-   Index_t numNode ;
-} ;
-
-// ########################################################
-//  Memory allocate/release routines
-// ########################################################
-#include "luleshMemory.hxx"
-
-
-/* Stuff needed for boundary conditions */
-/* 2 BCs on each of 6 hexahedral faces (12 bits) */
-#define XI_M        0x003
-#define XI_M_SYMM   0x001
-#define XI_M_FREE   0x002
-
-#define XI_P        0x00c
-#define XI_P_SYMM   0x004
-#define XI_P_FREE   0x008
-
-#define ETA_M       0x030
-#define ETA_M_SYMM  0x010
-#define ETA_M_FREE  0x020
-
-#define ETA_P       0x0c0
-#define ETA_P_SYMM  0x040
-#define ETA_P_FREE  0x080
-
-#define ZETA_M      0x300
-#define ZETA_M_SYMM 0x100
-#define ZETA_M_FREE 0x200
-
-#define ZETA_P      0xc00
-#define ZETA_P_SYMM 0x400
-#define ZETA_P_FREE 0x800
-
-
-RAJA_STORAGE
-void TimeIncrement(Domain *domain)
-{
-   Real_t targetdt = domain->stoptime - domain->time ;
-
-   if ((domain->dtfixed <= Real_t(0.0)) && (domain->cycle != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = domain->deltatime ;
-
-      /* This will require a reduction in parallel */
-      Real_t newdt = Real_t(1.0e+20) ;
-      if (domain->dtcourant < newdt) {
-         newdt = domain->dtcourant / Real_t(2.0) ;
-      }
-      if (domain->dthydro < newdt) {
-         newdt = domain->dthydro * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < domain->deltatimemultlb) {
-            newdt = olddt ;
-         }
-         else if (ratio > domain->deltatimemultub) {
-            newdt = olddt*domain->deltatimemultub ;
-         }
-      }
-
-      if (newdt > domain->dtmax) {
-         newdt = domain->dtmax ;
-      }
-      domain->deltatime = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > domain->deltatime) &&
-       (targetdt < (Real_t(4.0) * domain->deltatime / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * domain->deltatime / Real_t(3.0) ;
-   }
-
-   if (targetdt < domain->deltatime) {
-      domain->deltatime = targetdt ;
-   }
-
-   domain->time += domain->deltatime ;
-
-   ++domain->cycle ;
-}
-
-RAJA_STORAGE
-void InitStressTermsForElems(Real_p p, Real_p q,
-                             Real_p sigxx, Real_p sigyy, Real_p sigzz,
-                             RAJA::IndexSet *domElemList)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-
-   RAJA::forall<elem_exec_policy>(*domElemList, [=] RAJA_DEVICE (int idx) {
-      sigxx[idx] = sigyy[idx] = sigzz[idx] =  - p[idx] - q[idx] ;
-     }
-   ) ;
-}
-
-RAJA_STORAGE
-void CalcElemShapeFunctionDerivatives( const_Real_p x,
-                                       const_Real_p y,
-                                       const_Real_p z,
-                                       Real_t b[][8],
-                                       Real_t* const volume
-                                     )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-RAJA_STORAGE
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-RAJA_STORAGE
-void CalcElemNodeNormals(
-                         Real_p pfx,
-                         Real_p pfy,
-                         Real_p pfz,
-                         const_Real_p x,
-                         const_Real_p y,
-                         const_Real_p z
-                        )
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-RAJA_STORAGE
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_p fx, Real_p fy, Real_p fz
-                                )
-{
-  Real_t pfx0 = B[0][0] ;   Real_t pfx1 = B[0][1] ;
-  Real_t pfx2 = B[0][2] ;   Real_t pfx3 = B[0][3] ;
-  Real_t pfx4 = B[0][4] ;   Real_t pfx5 = B[0][5] ;
-  Real_t pfx6 = B[0][6] ;   Real_t pfx7 = B[0][7] ;
-
-  Real_t pfy0 = B[1][0] ;   Real_t pfy1 = B[1][1] ;
-  Real_t pfy2 = B[1][2] ;   Real_t pfy3 = B[1][3] ;
-  Real_t pfy4 = B[1][4] ;   Real_t pfy5 = B[1][5] ;
-  Real_t pfy6 = B[1][6] ;   Real_t pfy7 = B[1][7] ;
-
-  Real_t pfz0 = B[2][0] ;   Real_t pfz1 = B[2][1] ;
-  Real_t pfz2 = B[2][2] ;   Real_t pfz3 = B[2][3] ;
-  Real_t pfz4 = B[2][4] ;   Real_t pfz5 = B[2][5] ;
-  Real_t pfz6 = B[2][6] ;   Real_t pfz7 = B[2][7] ;
-
-  fx[0] = -( stress_xx * pfx0 );
-  fx[1] = -( stress_xx * pfx1 );
-  fx[2] = -( stress_xx * pfx2 );
-  fx[3] = -( stress_xx * pfx3 );
-  fx[4] = -( stress_xx * pfx4 );
-  fx[5] = -( stress_xx * pfx5 );
-  fx[6] = -( stress_xx * pfx6 );
-  fx[7] = -( stress_xx * pfx7 );
-
-  fy[0] = -( stress_yy * pfy0  );
-  fy[1] = -( stress_yy * pfy1  );
-  fy[2] = -( stress_yy * pfy2  );
-  fy[3] = -( stress_yy * pfy3  );
-  fy[4] = -( stress_yy * pfy4  );
-  fy[5] = -( stress_yy * pfy5  );
-  fy[6] = -( stress_yy * pfy6  );
-  fy[7] = -( stress_yy * pfy7  );
-
-  fz[0] = -( stress_zz * pfz0 );
-  fz[1] = -( stress_zz * pfz1 );
-  fz[2] = -( stress_zz * pfz2 );
-  fz[3] = -( stress_zz * pfz3 );
-  fz[4] = -( stress_zz * pfz4 );
-  fz[5] = -( stress_zz * pfz5 );
-  fz[6] = -( stress_zz * pfz6 );
-  fz[7] = -( stress_zz * pfz7 );
-}
-
-RAJA_STORAGE
-void IntegrateStressForElems( Index_t numElem, Index_p nodelist,
-                              Real_p x,  Real_p y,  Real_p z,
-                              Real_p fx, Real_p fy, Real_p fz,
-                              Real_p sigxx, Real_p sigyy, Real_p sigzz,
-                              Real_p determ, Index_p nodeElemStart,
-                              Index_p nodeElemCornerList,
-                              RAJA::IndexSet *domElemList,
-                              RAJA::IndexSet *domNodeList )
-{
-  Real_p fx_elem = Allocate<Real_t>(numElem*8) ;
-  Real_p fy_elem = Allocate<Real_t>(numElem*8) ;
-  Real_p fz_elem = Allocate<Real_t>(numElem*8) ;
-
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(*domElemList, [=] RAJA_DEVICE (int k) {
-    Real_t B[3][8] ;// shape function derivatives
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-
-    const Index_p elemNodes = &nodelist[8*k];
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      x_local[lnode] = x[gnode];
-      y_local[lnode] = y[gnode];
-      z_local[lnode] = z[gnode];
-    }
-
-    /* Volume calculation involves extra work for numerical consistency. */
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                     B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                         x_local, y_local, z_local );
-
-    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                 &fx_elem[k*8], &fy_elem[k*8], &fz_elem[k*8]) ;
-   }
-  ) ;
-
-  RAJA::forall<node_exec_policy>(*domNodeList, [=] RAJA_DEVICE (int gnode) {
-     Index_t count = nodeElemStart[gnode+1] - nodeElemStart[gnode] ;
-     Index_t *cornerList = &nodeElemCornerList[nodeElemStart[gnode]] ;
-     Real_t fx_sum = Real_t(0.0) ;
-     Real_t fy_sum = Real_t(0.0) ;
-     Real_t fz_sum = Real_t(0.0) ;
-     for (Index_t i=0 ; i < count ; ++i) {
-        Index_t elem = cornerList[i] ;
-        fx_sum += fx_elem[elem] ;
-        fy_sum += fy_elem[elem] ;
-        fz_sum += fz_elem[elem] ;
-     }
-     fx[gnode] = fx_sum ;
-     fy[gnode] = fy_sum ;
-     fz[gnode] = fz_sum ;
-   }
-  ) ;
-
-  Release(&fz_elem) ;
-  Release(&fy_elem) ;
-  Release(&fx_elem) ;
-}
-
-RAJA_STORAGE
-void CollectDomainNodesToElemNodes(Real_p x, Real_p y, Real_p z,
-                                   Index_p elemToNode,
-                                   Real_p elemX,
-                                   Real_p elemY,
-                                   Real_p elemZ
-                                  )
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = x[nd0i];
-   elemX[1] = x[nd1i];
-   elemX[2] = x[nd2i];
-   elemX[3] = x[nd3i];
-   elemX[4] = x[nd4i];
-   elemX[5] = x[nd5i];
-   elemX[6] = x[nd6i];
-   elemX[7] = x[nd7i];
-
-   elemY[0] = y[nd0i];
-   elemY[1] = y[nd1i];
-   elemY[2] = y[nd2i];
-   elemY[3] = y[nd3i];
-   elemY[4] = y[nd4i];
-   elemY[5] = y[nd5i];
-   elemY[6] = y[nd6i];
-   elemY[7] = y[nd7i];
-
-   elemZ[0] = z[nd0i];
-   elemZ[1] = z[nd1i];
-   elemZ[2] = z[nd2i];
-   elemZ[3] = z[nd3i];
-   elemZ[4] = z[nd4i];
-   elemZ[5] = z[nd5i];
-   elemZ[6] = z[nd6i];
-   elemZ[7] = z[nd7i];
-
-}
-
-RAJA_STORAGE
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-RAJA_STORAGE
-void CalcElemVolumeDerivative(
-                              Real_p dvdx,
-                              Real_p dvdy,
-                              Real_p dvdz,
-                              const_Real_p x,
-                              const_Real_p y,
-                              const_Real_p z
-                             )
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-RAJA_STORAGE
-void CalcElemFBHourglassForce(
-                              Real_p xd, Real_p yd, Real_p zd,
-                              Real_p hourgam0, Real_p hourgam1,
-                              Real_p hourgam2, Real_p hourgam3,
-                              Real_p hourgam4, Real_p hourgam5,
-                              Real_p hourgam6, Real_p hourgam7,
-                              Real_t coefficient,
-                              Real_p hgfx, Real_p hgfy, Real_p hgfz
-                             )
-{
-   const Index_t i00=0;
-   const Index_t i01=1;
-   const Index_t i02=2;
-   const Index_t i03=3;
-
-   Real_t h00 =
-      hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] +
-      hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] +
-      hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] +
-      hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7];
-
-   Real_t h01 =
-      hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] +
-      hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] +
-      hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] +
-      hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7];
-
-   Real_t h02 =
-      hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+
-      hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+
-      hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+
-      hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7];
-
-   Real_t h03 =
-      hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] +
-      hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] +
-      hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] +
-      hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7];
-
-   hgfx[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfx[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfx[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfx[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfx[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfx[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfx[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfx[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] +
-      hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] +
-      hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] +
-      hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7];
-
-   h01 =
-      hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] +
-      hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] +
-      hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] +
-      hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7];
-
-   h02 =
-      hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+
-      hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+
-      hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+
-      hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7];
-
-   h03 =
-      hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] +
-      hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] +
-      hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] +
-      hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7];
-
-
-   hgfy[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfy[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfy[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfy[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfy[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfy[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfy[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfy[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] +
-      hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] +
-      hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] +
-      hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7];
-
-   h01 =
-      hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] +
-      hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] +
-      hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] +
-      hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7];
-
-   h02 =
-      hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+
-      hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+
-      hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+
-      hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7];
-
-   h03 =
-      hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] +
-      hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] +
-      hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] +
-      hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7];
-
-
-   hgfz[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfz[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfz[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfz[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfz[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfz[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfz[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfz[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-}
-
-const Real_t ggamma[4][8] =
-{
-   { Real_t( 1.), Real_t( 1.), Real_t(-1.), Real_t(-1.),
-     Real_t(-1.), Real_t(-1.), Real_t( 1.), Real_t( 1.) },
-
-   { Real_t( 1.), Real_t(-1.), Real_t(-1.), Real_t( 1.),
-     Real_t(-1.), Real_t( 1.), Real_t( 1.), Real_t(-1.) },
-
-   { Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.),
-     Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.) },
-
-   { Real_t(-1.), Real_t( 1.), Real_t(-1.), Real_t( 1.),
-     Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.) }
-
-} ;
-
-
-RAJA_STORAGE
-void CalcFBHourglassForceForElems( Index_t numElem, Index_t numNode,
-                                   Index_p nodelist,
-                                   Real_p  ss, Real_p  elemMass,
-                                   Real_p  xd, Real_p  yd, Real_p  zd,
-                                   Real_p  fx, Real_p  fy, Real_p  fz,
-                                   Real_p  determ,
-                                   Real_p  x8n, Real_p  y8n, Real_p  z8n,
-                                   Real_p  dvdx, Real_p  dvdy, Real_p  dvdz,
-                                   Real_t hourg, Index_p nodeElemStart,
-                                   Index_p nodeElemCornerList,
-                                   RAJA::IndexSet *domElemList,
-                                   RAJA::IndexSet *domNodeList)
-{
-   /*************************************************
-    *
-    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-    *               force.
-    *
-    *************************************************/
-
-   Real_p fx_elem = Allocate<Real_t>(numElem*8) ;
-   Real_p fy_elem = Allocate<Real_t>(numElem*8) ;
-   Real_p fz_elem = Allocate<Real_t>(numElem*8) ;
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-   RAJA::forall<elem_exec_policy>(*domElemList, [=] RAJA_DEVICE (int i2) {
-      Real_t coefficient;
-
-      Real_t hourgam0[4], hourgam1[4], hourgam2[4], hourgam3[4] ;
-      Real_t hourgam4[4], hourgam5[4], hourgam6[4], hourgam7[4];
-      Real_t xd1[8], yd1[8], zd1[8] ;
-
-      Index_p elemToNode = &nodelist[8*i2];
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * ggamma[i1][0] + x8n[i3+1] * ggamma[i1][1] +
-            x8n[i3+2] * ggamma[i1][2] + x8n[i3+3] * ggamma[i1][3] +
-            x8n[i3+4] * ggamma[i1][4] + x8n[i3+5] * ggamma[i1][5] +
-            x8n[i3+6] * ggamma[i1][6] + x8n[i3+7] * ggamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * ggamma[i1][0] + y8n[i3+1] * ggamma[i1][1] +
-            y8n[i3+2] * ggamma[i1][2] + y8n[i3+3] * ggamma[i1][3] +
-            y8n[i3+4] * ggamma[i1][4] + y8n[i3+5] * ggamma[i1][5] +
-            y8n[i3+6] * ggamma[i1][6] + y8n[i3+7] * ggamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * ggamma[i1][0] + z8n[i3+1] * ggamma[i1][1] +
-            z8n[i3+2] * ggamma[i1][2] + z8n[i3+3] * ggamma[i1][3] +
-            z8n[i3+4] * ggamma[i1][4] + z8n[i3+5] * ggamma[i1][5] +
-            z8n[i3+6] * ggamma[i1][6] + z8n[i3+7] * ggamma[i1][7];
-
-         hourgam0[i1] = ggamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam1[i1] = ggamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam2[i1] = ggamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam3[i1] = ggamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam4[i1] = ggamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam5[i1] = ggamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam6[i1] = ggamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam7[i1] = ggamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=ss[i2];
-      mass1=elemMass[i2];
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = xd[n0si2];
-      xd1[1] = xd[n1si2];
-      xd1[2] = xd[n2si2];
-      xd1[3] = xd[n3si2];
-      xd1[4] = xd[n4si2];
-      xd1[5] = xd[n5si2];
-      xd1[6] = xd[n6si2];
-      xd1[7] = xd[n7si2];
-
-      yd1[0] = yd[n0si2];
-      yd1[1] = yd[n1si2];
-      yd1[2] = yd[n2si2];
-      yd1[3] = yd[n3si2];
-      yd1[4] = yd[n4si2];
-      yd1[5] = yd[n5si2];
-      yd1[6] = yd[n6si2];
-      yd1[7] = yd[n7si2];
-
-      zd1[0] = zd[n0si2];
-      zd1[1] = zd[n1si2];
-      zd1[2] = zd[n2si2];
-      zd1[3] = zd[n3si2];
-      zd1[4] = zd[n4si2];
-      zd1[5] = zd[n5si2];
-      zd1[6] = zd[n6si2];
-      zd1[7] = zd[n7si2];
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1,
-                      hourgam0,hourgam1,hourgam2,hourgam3,
-                      hourgam4,hourgam5,hourgam6,hourgam7, coefficient,
-                      &fx_elem[i3], &fy_elem[i3], &fz_elem[i3] );
-    }
-   ) ; 
-
-   /* added tmp arrays for fault tolerance */
-   Real_p fx_tmp  = Allocate<Real_t>(numNode) ;
-   Real_p fy_tmp  = Allocate<Real_t>(numNode) ;
-   Real_p fz_tmp  = Allocate<Real_t>(numNode) ;
-
-   RAJA::forall<node_exec_policy>(*domNodeList, [=] RAJA_DEVICE (int gnode) {
-      fx_tmp[gnode] = fx[gnode] ;
-      fy_tmp[gnode] = fy[gnode] ;
-      fz_tmp[gnode] = fz[gnode] ;
-    }
-   ) ;
-
-   RAJA::forall<node_exec_policy>(*domNodeList, [=] RAJA_DEVICE (int gnode) {
-      Index_t count = nodeElemStart[gnode+1] - nodeElemStart[gnode] ;
-      Index_t *cornerList = &nodeElemCornerList[nodeElemStart[gnode]] ;
-      Real_t fx_sum = Real_t(0.0) ;
-      Real_t fy_sum = Real_t(0.0) ;
-      Real_t fz_sum = Real_t(0.0) ;
-      for (Index_t i=0 ; i < count ; ++i) {
-         Index_t elem = cornerList[i] ;
-         fx_sum += fx_elem[elem] ;
-         fy_sum += fy_elem[elem] ;
-         fz_sum += fz_elem[elem] ;
-      }
-      fx[gnode] = fx_tmp[gnode] + fx_sum ;
-      fy[gnode] = fy_tmp[gnode] + fy_sum ;
-      fz[gnode] = fz_tmp[gnode] + fz_sum ;
-    }
-   ) ;
-
-   Release(&fz_tmp) ;
-   Release(&fy_tmp) ;
-   Release(&fx_tmp) ;
-
-   Release(&fz_elem) ;
-   Release(&fy_elem) ;
-   Release(&fx_elem) ;
-}
-
-RAJA_STORAGE
-void CalcHourglassControlForElems(Domain *domain,
-                                  Real_p determ,
-                                  Real_t hgcoef)
-{
-   Index_t numElem = domain->numElem ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_p dvdx = Allocate<Real_t>(numElem8) ;
-   Real_p dvdy = Allocate<Real_t>(numElem8) ;
-   Real_p dvdz = Allocate<Real_t>(numElem8) ;
-   Real_p x8n  = Allocate<Real_t>(numElem8) ;
-   Real_p y8n  = Allocate<Real_t>(numElem8) ;
-   Real_p z8n  = Allocate<Real_t>(numElem8) ;
-
-   // For negative element volume check
-   RAJA::ReduceMin<reduce_policy, Real_t> minvol(1.0);
-
-   /* start loop over elements */
-   RAJA::forall<elem_exec_policy>(*domain->domElemList, [=] RAJA_DEVICE (int idx) {
-
-      Index_p elemToNode = &domain->nodelist[8*idx];
-      CollectDomainNodesToElemNodes(domain->x, domain->y, domain->z, elemToNode,
-                                    &x8n[8*idx], &y8n[8*idx], &z8n[8*idx] );
-
-      CalcElemVolumeDerivative(&dvdx[8*idx], &dvdy[8*idx], &dvdz[8*idx],
-                               & x8n[8*idx], & y8n[8*idx], & z8n[8*idx]);
-
-      determ[idx] = domain->volo[idx] * domain->v[idx];
-
-      determ[idx] = domain->volo[idx] * domain->v[idx];
-
-      minvol.min(domain->v[idx]);
-
-    }
-   ) ;
-
-   if ( Real_t(minvol) <= Real_t(0.0) ) {
-      exit(VolumeError) ;
-   }
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems( numElem, domain->numNode,
-                                    domain->nodelist,
-                                    domain->ss, domain->elemMass,
-                                    domain->xd, domain->yd, domain->zd,
-                                    domain->fx, domain->fy, domain->fz,
-                                    determ, x8n, y8n, z8n, dvdx, dvdy, dvdz,
-                                    hgcoef, domain->nodeElemStart,
-                                    domain->nodeElemCornerList,
-                                    domain->domElemList, domain->domNodeList) ;
-   }
-
-   Release(&z8n) ;
-   Release(&y8n) ;
-   Release(&x8n) ;
-   Release(&dvdz) ;
-   Release(&dvdy) ;
-   Release(&dvdx) ;
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcVolumeForceForElems(Domain *domain)
-{
-   Index_t numElem = domain->numElem ;
-   if (numElem != 0) {
-      Real_t  hgcoef = domain->hgcoef ;
-      Real_p sigxx  = Allocate<Real_t>(numElem) ;
-      Real_p sigyy  = Allocate<Real_t>(numElem) ;
-      Real_p sigzz  = Allocate<Real_t>(numElem) ;
-      Real_p determ = Allocate<Real_t>(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(domain->p, domain->q,
-                              sigxx, sigyy, sigzz, domain->domElemList);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( numElem, domain->nodelist,
-                               domain->x, domain->y, domain->z,
-                               domain->fx, domain->fy, domain->fz,
-                               sigxx, sigyy, sigzz, determ,
-                               domain->nodeElemStart,
-                               domain->nodeElemCornerList,
-                               domain->domElemList, domain->domNodeList) ;
-
-      // check for negative element volume
-      RAJA::ReduceMin<reduce_policy, Real_t> minvol(1.0);
-      RAJA::forall<elem_exec_policy>(*domain->domElemList, [=] RAJA_DEVICE (int k) {
-         minvol.min(determ[k]);
-       }
-      ) ;
-
-      if ( Real_t(minvol) <= Real_t(0.0)) {
-         exit(VolumeError) ;
-      }
-
-      CalcHourglassControlForElems(domain, determ, hgcoef) ;
-
-      Release(&determ) ;
-      Release(&sigzz) ;
-      Release(&sigyy) ;
-      Release(&sigxx) ;
-   }
-}
-
-RAJA_STORAGE
-void CalcForceForNodes(Domain *domain)
-{
-  RAJA::forall<node_exec_policy>(*domain->domNodeList, [=] RAJA_DEVICE (int i) {
-     domain->fx[i] = Real_t(0.0) ;
-     domain->fy[i] = Real_t(0.0) ;
-     domain->fz[i] = Real_t(0.0) ;
-   }
-  ) ;
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems(domain) ;
-
-  /* Calculate Nodal Forces at domain boundaries */
-  /* problem->commSBN->Transfer(CommSBN::forces); */
-
-}
-
-RAJA_STORAGE
-void CalcAccelerationForNodes(Real_p xdd, Real_p ydd, Real_p zdd,
-                              Real_p fx, Real_p fy, Real_p fz,
-                              Real_p nodalMass, RAJA::IndexSet *domNodeList)
-{
-   RAJA::forall<node_exec_policy>(*domNodeList, [=] RAJA_DEVICE (int i) {
-      xdd[i] = fx[i] / nodalMass[i];
-      ydd[i] = fy[i] / nodalMass[i];
-      zdd[i] = fz[i] / nodalMass[i];
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void ApplyAccelerationBoundaryConditionsForNodes(Real_p xdd, Real_p ydd,
-                                                 Real_p zdd, Index_p symmX,
-                                                 Index_p symmY, Index_p symmZ,
-                                                 Index_t size)
-{
-  Index_t numNodeBC = (size+1)*(size+1) ;
-
-  /*  !!! Interesting FT discussion here -- not converted !!! */
-  /* What if the array index is corrupted? Out of bounds? */
-  RAJA::forall<range_exec_policy>(int(0), int(numNodeBC), [=] RAJA_DEVICE (int i) {
-     xdd[symmX[i]] = Real_t(0.0) ;
-     ydd[symmY[i]] = Real_t(0.0) ;
-     zdd[symmZ[i]] = Real_t(0.0) ;
-   }
-  ) ;
-}
-
-RAJA_STORAGE
-void CalcVelocityForNodes(Index_t numNode, Real_p xd,  Real_p yd,  Real_p zd,
-                          Real_p xdd, Real_p ydd, Real_p zdd,
-                          const Real_t dt, const Real_t u_cut,
-                          RAJA::IndexSet *domNodeList)
-{
-   Real_p xd_tmp = Allocate<Real_t>(numNode) ;
-   Real_p yd_tmp = Allocate<Real_t>(numNode) ;
-   Real_p zd_tmp = Allocate<Real_t>(numNode) ;
-
-   /* for FT */
-   RAJA::forall<node_exec_policy>( *domNodeList, [=] RAJA_DEVICE (int i) {
-      xd_tmp[i] = xd[i] ;
-      yd_tmp[i] = yd[i] ;
-      zd_tmp[i] = zd[i] ;
-    }
-   ) ;
-
-   RAJA::forall<node_exec_policy>( *domNodeList, [=] RAJA_DEVICE (int i) {
-     Real_t xdtmp, ydtmp, zdtmp ;
-
-     xdtmp = xd_tmp[i] + xdd[i] * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     xd[i] = xdtmp ;
-
-     ydtmp = yd_tmp[i] + ydd[i] * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     yd[i] = ydtmp ;
-
-     zdtmp = zd_tmp[i] + zdd[i] * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     zd[i] = zdtmp ;
-    }
-   ) ;
-
-   Release(&zd_tmp) ;
-   Release(&yd_tmp) ;
-   Release(&xd_tmp) ;
-}
-
-RAJA_STORAGE
-void CalcPositionForNodes(Index_t numNode, Real_p x,  Real_p y,  Real_p z,
-                          Real_p xd, Real_p yd, Real_p zd,
-                          const Real_t dt, RAJA::IndexSet *domNodeList)
-{
-   Real_p x_tmp = Allocate<Real_t>(numNode) ;
-   Real_p y_tmp = Allocate<Real_t>(numNode) ;
-   Real_p z_tmp = Allocate<Real_t>(numNode) ;
-
-   /* for FT */
-   RAJA::forall<node_exec_policy>( *domNodeList, [=] RAJA_DEVICE (int i) {
-      x_tmp[i] = x[i] ;
-      y_tmp[i] = y[i] ;
-      z_tmp[i] = z[i] ;
-    }
-   ) ;
-
-   RAJA::forall<node_exec_policy>( *domNodeList, [=] RAJA_DEVICE (int i) {
-     x[i] = x_tmp[i] + xd[i] * dt ;
-     y[i] = y_tmp[i] + yd[i] * dt ;
-     z[i] = z_tmp[i] + zd[i] * dt ;
-    }
-   ) ;
-
-   Release(&z_tmp) ;
-   Release(&y_tmp) ;
-   Release(&x_tmp) ;
-}
-
-RAJA_STORAGE
-void LagrangeNodal(Domain *domain)
-{
-  const Real_t delt = domain->deltatime ;
-  Real_t u_cut = domain->u_cut ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes(domain);
-
-  CalcAccelerationForNodes(domain->xdd, domain->ydd, domain->zdd,
-                           domain->fx, domain->fy, domain->fz,
-                           domain->nodalMass, domain->domNodeList);
-
-  ApplyAccelerationBoundaryConditionsForNodes(domain->xdd, domain->ydd,
-                                              domain->zdd, domain->symmX,
-                                              domain->symmY, domain->symmZ,
-                                              domain->sizeX );
-
-  CalcVelocityForNodes( domain->numNode,
-                        domain->xd,  domain->yd,  domain->zd,
-                        domain->xdd, domain->ydd, domain->zdd,
-                        delt, u_cut, domain->domNodeList) ;
-
-  CalcPositionForNodes( domain->numNode,
-                        domain->x,  domain->y,  domain->z,
-                        domain->xd, domain->yd, domain->zd,
-                        delt, domain->domNodeList );
-
-  return;
-}
-
-RAJA_STORAGE
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-RAJA_STORAGE
-Real_t CalcElemVolume(
-                       const_Real_p x, const_Real_p y, const_Real_p z
-                     )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-RAJA_STORAGE
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-RAJA_STORAGE
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = MAX(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-RAJA_STORAGE
-void CalcElemVelocityGrandient( const Real_t* const xvel,
-                                const Real_t* const yvel,
-                                const Real_t* const zvel,
-                                const Real_t b[][8],
-                                const Real_t detJ,
-                                Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-RAJA_STORAGE
-void CalcKinematicsForElems( Index_p nodelist,
-                             Real_p x,   Real_p y,   Real_p z,
-                             Real_p xd,  Real_p yd,  Real_p zd,
-                             Real_p dxx, Real_p dyy, Real_p dzz,
-                             Real_p v, Real_p volo,
-                             Real_p vnew, Real_p delv, Real_p arealg,
-                             Real_t deltaTime, RAJA::IndexSet *domElemList )
-{
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(*domElemList, [=] RAJA_DEVICE (int k) {
-    Real_t B[3][8] ; /** shape function derivatives */
-    Real_t D[6] ;
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-    Real_t xd_local[8] ;
-    Real_t yd_local[8] ;
-    Real_t zd_local[8] ;
-    Real_t detJ = Real_t(0.0) ;
-
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_p elemToNode = &nodelist[8*k] ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      x_local[lnode] = x[gnode];
-      y_local[lnode] = y[gnode];
-      z_local[lnode] = z[gnode];
-    }
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / volo[k] ;
-    vnew[k] = relativeVolume ;
-    delv[k] = relativeVolume - v[k] ;
-
-    // set characteristic length
-    arealg[k] = CalcElemCharacteristicLength(x_local, y_local, z_local,
-                                             volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = xd[gnode];
-      yd_local[lnode] = yd[gnode];
-      zd_local[lnode] = zd[gnode];
-    }
-
-    Real_t dt2 = Real_t(0.5) * deltaTime;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local, y_local, z_local,
-                                      B, &detJ );
-
-    CalcElemVelocityGrandient( xd_local, yd_local, zd_local,
-                               B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    dxx[k] = D[0];
-    dyy[k] = D[1];
-    dzz[k] = D[2];
-   }
-  ) ;
-}
-
-RAJA_STORAGE
-void CalcLagrangeElements(Domain *domain)
-{
-   Index_t numElem = domain->numElem ;
-   if (numElem > 0) {
-      const Real_t deltatime = domain->deltatime ;
-      Real_p dxx_tmp = Allocate<Real_t>(numElem) ;
-      Real_p dyy_tmp = Allocate<Real_t>(numElem) ;
-      Real_p dzz_tmp = Allocate<Real_t>(numElem) ;
-
-      domain->dxx  = Allocate<Real_t>(numElem) ; /* principal strains */
-      domain->dyy  = Allocate<Real_t>(numElem) ;
-      domain->dzz  = Allocate<Real_t>(numElem) ;
-
-      CalcKinematicsForElems(domain->nodelist,
-                             domain->x, domain->y, domain->z,
-                             domain->xd, domain->yd, domain->zd,
-                             domain->dxx, domain->dyy, domain->dzz,
-                             domain->v, domain->volo,
-                             domain->vnew, domain->delv, domain->arealg,
-                             deltatime, domain->domElemList) ;
-
-
-
-      /* For FT... since domain dxx, dyy, dzz are not used, not really needed */
-      RAJA::forall<elem_exec_policy>( *domain->domElemList, [=] RAJA_DEVICE (int k) {
-         dxx_tmp[k] =  domain->dxx[k] ;
-         dyy_tmp[k] =  domain->dyy[k] ;
-         dzz_tmp[k] =  domain->dzz[k] ;
-       }
-      ) ;
-
-      // check for negative element volume
-      RAJA::ReduceMin<reduce_policy, Real_t> minvol(1.0);
-
-      // element loop to do some stuff not included in the elemlib function.
-      RAJA::forall<elem_exec_policy>( *domain->domElemList, [=] RAJA_DEVICE (int k) {
-        // calc strain rate and apply as constraint (only done in FB element)
-        Real_t vdov = dxx_tmp[k] + dyy_tmp[k] + dzz_tmp[k] ;
-        Real_t vdovthird = vdov/Real_t(3.0) ;
-        
-        // make the rate of deformation tensor deviatoric
-        domain->vdov[k] = vdov ;
-        domain->dxx[k] = dxx_tmp[k] - vdovthird ;
-        domain->dyy[k] = dyy_tmp[k] - vdovthird ;
-        domain->dzz[k] = dzz_tmp[k] - vdovthird ;
-
-        minvol.min(domain->vnew[k]);
-       }
-      ) ;
-
-      if ( Real_t(minvol) <= Real_t(0.0)) {
-         exit(VolumeError) ;
-      }
-
-      Release(&domain->dzz) ;
-      Release(&domain->dyy) ;
-      Release(&domain->dxx) ;
-
-      Release(&dzz_tmp) ;
-      Release(&dyy_tmp) ;
-      Release(&dxx_tmp) ;
-   }
-}
-
-RAJA_STORAGE
-void CalcMonotonicQGradientsForElems(Real_p x,  Real_p y,  Real_p z,
-                                     Real_p xd, Real_p yd, Real_p zd,
-                                     Real_p volo, Real_p vnew,
-                                     Real_p delv_xi,
-                                     Real_p delv_eta,
-                                     Real_p delv_zeta,
-                                     Real_p delx_xi,
-                                     Real_p delx_eta,
-                                     Real_p delx_zeta,
-                                     Index_p nodelist,
-                                     RAJA::IndexSet *domElemList)
-{
-#define SUM4(a,b,c,d) (a + b + c + d)
-
-   RAJA::forall<elem_exec_policy>(*domElemList, [=] RAJA_DEVICE (int i) {
-      const Real_t ptiny = Real_t(1.e-36) ;
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      Index_p elemToNode = &nodelist[8*i];
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = x[n0] ;
-      Real_t x1 = x[n1] ;
-      Real_t x2 = x[n2] ;
-      Real_t x3 = x[n3] ;
-      Real_t x4 = x[n4] ;
-      Real_t x5 = x[n5] ;
-      Real_t x6 = x[n6] ;
-      Real_t x7 = x[n7] ;
-
-      Real_t y0 = y[n0] ;
-      Real_t y1 = y[n1] ;
-      Real_t y2 = y[n2] ;
-      Real_t y3 = y[n3] ;
-      Real_t y4 = y[n4] ;
-      Real_t y5 = y[n5] ;
-      Real_t y6 = y[n6] ;
-      Real_t y7 = y[n7] ;
-
-      Real_t z0 = z[n0] ;
-      Real_t z1 = z[n1] ;
-      Real_t z2 = z[n2] ;
-      Real_t z3 = z[n3] ;
-      Real_t z4 = z[n4] ;
-      Real_t z5 = z[n5] ;
-      Real_t z6 = z[n6] ;
-      Real_t z7 = z[n7] ;
-
-      Real_t xv0 = xd[n0] ;
-      Real_t xv1 = xd[n1] ;
-      Real_t xv2 = xd[n2] ;
-      Real_t xv3 = xd[n3] ;
-      Real_t xv4 = xd[n4] ;
-      Real_t xv5 = xd[n5] ;
-      Real_t xv6 = xd[n6] ;
-      Real_t xv7 = xd[n7] ;
-
-      Real_t yv0 = yd[n0] ;
-      Real_t yv1 = yd[n1] ;
-      Real_t yv2 = yd[n2] ;
-      Real_t yv3 = yd[n3] ;
-      Real_t yv4 = yd[n4] ;
-      Real_t yv5 = yd[n5] ;
-      Real_t yv6 = yd[n6] ;
-      Real_t yv7 = yd[n7] ;
-
-      Real_t zv0 = zd[n0] ;
-      Real_t zv1 = zd[n1] ;
-      Real_t zv2 = zd[n2] ;
-      Real_t zv3 = zd[n3] ;
-      Real_t zv4 = zd[n4] ;
-      Real_t zv5 = zd[n5] ;
-      Real_t zv6 = zd[n6] ;
-      Real_t zv7 = zd[n7] ;
-
-      Real_t vol = volo[i]*vnew[i] ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;
-      Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;
-      Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;
-      Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;
-      Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;
-      Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;
-      Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      delx_zeta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;
-      dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;
-      dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;
-
-      delv_zeta[i] = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      delx_xi[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;
-      dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;
-      dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;
-
-      delv_xi[i] = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      delx_eta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;
-      dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;
-      dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;
-
-      delv_eta[i] = ax*dxv + ay*dyv + az*dzv ;
-    }
-   ) ;
-
-#undef SUM4
-}
-
-RAJA_STORAGE
-void CalcMonotonicQRegionForElems(
-                           RAJA::IndexSet *matElemList, Index_p elemBC,
-                           Index_p lxim,   Index_p lxip,
-                           Index_p letam,  Index_p letap,
-                           Index_p lzetam, Index_p lzetap,
-                           Real_p delv_xi,Real_p delv_eta,Real_p delv_zeta,
-                           Real_p delx_xi,Real_p delx_eta,Real_p delx_zeta,
-                           Real_p vdov, Real_p volo, Real_p vnew,
-                           Real_p elemMass, Real_p qq, Real_p ql,
-                           Real_t qlc_monoq, Real_t qqc_monoq,
-                           Real_t monoq_limiter_mult,
-                           Real_t monoq_max_slope,
-                           Real_t ptiny )
-{
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Int_t bcMask = elemBC[i] ;
-      Real_t delvm, delvp ;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / ( delv_xi[i] + ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case 0:         delvm = delv_xi[lxim[i]] ; break ;
-         case XI_M_SYMM: delvm = delv_xi[i] ;       break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
-         default:        /* ERROR */ ;              break ;
-      }
-      switch (bcMask & XI_P) {
-         case 0:         delvp = delv_xi[lxip[i]] ; break ;
-         case XI_P_SYMM: delvp = delv_xi[i] ;       break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
-         default:        /* ERROR */ ;              break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( delv_eta[i] + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case 0:          delvm = delv_eta[letam[i]] ; break ;
-         case ETA_M_SYMM: delvm = delv_eta[i] ;        break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
-         default:         /* ERROR */ ;                break ;
-      }
-      switch (bcMask & ETA_P) {
-         case 0:          delvp = delv_eta[letap[i]] ; break ;
-         case ETA_P_SYMM: delvp = delv_eta[i] ;        break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
-         default:         /* ERROR */ ;                break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( delv_zeta[i] + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case 0:           delvm = delv_zeta[lzetam[i]] ; break ;
-         case ZETA_M_SYMM: delvm = delv_zeta[i] ;         break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
-         default:          /* ERROR */ ;                  break ;
-      }
-      switch (bcMask & ZETA_P) {
-         case 0:           delvp = delv_zeta[lzetap[i]] ; break ;
-         case ZETA_P_SYMM: delvp = delv_zeta[i] ;         break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
-         default:          /* ERROR */ ;                  break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( vdov[i] > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = delv_xi[i]   * delx_xi[i]   ;
-         Real_t delvxeta  = delv_eta[i]  * delx_eta[i]  ;
-         Real_t delvxzeta = delv_zeta[i] * delx_zeta[i] ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = elemMass[i] / (volo[i] * vnew[i]) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      qq[i] = qquad ;
-      ql[i] = qlin  ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void CalcMonotonicQForElems(Domain *domain)
-{  
-   //
-   // calculate the monotonic q for pure regions
-   //
-   Index_t numElem = domain->numElem ;
-   if (numElem > 0) {
-      //
-      // initialize parameters
-      // 
-      const Real_t ptiny = Real_t(1.e-36) ;
-
-      CalcMonotonicQRegionForElems(
-                           domain->matElemList, domain->elemBC,
-                           domain->lxim,   domain->lxip,
-                           domain->letam,  domain->letap,
-                           domain->lzetam, domain->lzetap,
-                           domain->delv_xi,domain->delv_eta,domain->delv_zeta,
-                           domain->delx_xi,domain->delx_eta,domain->delx_zeta,
-                           domain->vdov, domain->volo, domain->vnew,
-                           domain->elemMass, domain->qq, domain->ql,
-                           domain->qlc_monoq, domain->qqc_monoq,
-                           domain->monoq_limiter_mult,
-                           domain->monoq_max_slope,
-                           ptiny );
-   }
-}
-
-RAJA_STORAGE
-void CalcQForElems(Domain *domain)
-{
-   //
-   // MONOTONIC Q option
-   //
-
-   Index_t numElem = domain->numElem ;
-
-   if (numElem != 0) {
-      /* allocate domain length arrays */
-
-      domain->delv_xi = Allocate<Real_t>(numElem) ;   /* velocity gradient */
-      domain->delv_eta = Allocate<Real_t>(numElem) ;
-      domain->delv_zeta = Allocate<Real_t>(numElem) ;
-
-      domain->delx_xi = Allocate<Real_t>(numElem) ;   /* position gradient */
-      domain->delx_eta = Allocate<Real_t>(numElem) ;
-      domain->delx_zeta = Allocate<Real_t>(numElem) ;
-
-      /* Calculate velocity gradients, applied at the domain level */
-      CalcMonotonicQGradientsForElems(domain->x,  domain->y,  domain->z,
-                                      domain->xd, domain->yd, domain->zd,
-                                      domain->volo, domain->vnew,
-                                      domain->delv_xi,
-                                      domain->delv_eta,
-                                      domain->delv_zeta,
-                                      domain->delx_xi,
-                                      domain->delx_eta,
-                                      domain->delx_zeta,
-                                      domain->nodelist,
-                                      domain->domElemList) ;
-
-      /* Transfer veloctiy gradients in the first order elements */
-      /* problem->commElements->Transfer(CommElements::monoQ) ; */
-
-      /* This will be applied at the region level */
-      CalcMonotonicQForElems(domain) ;
-
-      /* release domain length arrays */
-
-      Release(&domain->delx_zeta) ;
-      Release(&domain->delx_eta) ;
-      Release(&domain->delx_xi) ;
-
-      Release(&domain->delv_zeta) ;
-      Release(&domain->delv_eta) ;
-      Release(&domain->delv_xi) ;
-
-      /* Don't allow excessive artificial viscosity */
-      Real_t qstop = domain->qstop ;
-      Index_t idx = -1; 
-      RAJA::forall<elem_exec_policy>( *domain->domElemList, [=] RAJA_DEVICE (int i) {
-         if ( domain->q[i] > qstop ) {
-            idx = i ;
-            // break ;
-         }
-       }
-      ) ;
-
-      if(idx >= 0) {
-         exit(QStopError) ;
-      }
-   }
-}
-
-RAJA_STORAGE
-void CalcPressureForElems(Real_p p_new, Real_p bvc,
-                          Real_p pbvc, Real_p e_old,
-                          Real_p compression, Real_p vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          RAJA::IndexSet *matElemList)
-{
-   const Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      bvc[i] = c1s * (compression[i] + Real_t(1.));
-      pbvc[i] = c1s;
-    }
-   ) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      p_new[i] = bvc[i] * e_old[i] ;
-
-      if    (FABS(p_new[i]) <  p_cut   )
-         p_new[i] = Real_t(0.0) ;
-
-      if    ( vnewc[i] >= eosvmax ) /* impossible condition here? */
-         p_new[i] = Real_t(0.0) ;
-
-      if    (p_new[i]       <  pmin)
-         p_new[i]   = pmin ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void CalcEnergyForElems(Real_p p_new, Real_p e_new, Real_p q_new,
-                        Real_p bvc, Real_p pbvc,
-                        Real_p p_old, Real_p e_old, Real_p q_old,
-                        Real_p compression, Real_p compHalfStep,
-                        Real_p vnewc, Real_p work, Real_p delvc, Real_t pmin,
-                        Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
-                        Real_p qq_old, Real_p ql_old,
-                        Real_t rho0,
-                        Real_t eosvmax,
-                        RAJA::IndexSet *matElemList,
-                        Index_t length)
-{
-   const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-   Real_p pHalfStep = Allocate<Real_t>(length) ;
-   Real_p e_new_tmp = Allocate<Real_t>(length) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
-         + Real_t(0.5) * work[i];
-
-      if (e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-    }
-   ) ;
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                   pmin, p_cut, eosvmax, matElemList);
-
-   /* for FT */
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      e_new_tmp[i] = e_new[i] ;
-    }
-   ) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
-
-      if ( delvc[i] > Real_t(0.) ) {
-         q_new[i] /* = qq_old[i] = ql_old[i] */ = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new_tmp[i] + Real_t(0.5) * (delvc[i]
-           * (  Real_t(3.0)*(p_old[i]     + q_old[i])
-              - Real_t(4.0)*(pHalfStep[i] + q_new[i])) + work[i] ) ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-    }
-   ) ;
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, matElemList);
-
-   /* for FT */
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      e_new_tmp[i] = e_new[i] ;
-    }
-   ) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      Real_t q_tilde ;
-
-      if (delvc[i] > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new_tmp[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
-                   - Real_t(8.0)*(pHalfStep[i] + q_new[i])
-                   + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-    }
-   ) ;
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, matElemList);
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-
-      if ( delvc[i] <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-
-         if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
-      }
-    }
-   ) ;
-
-   Release(&e_new_tmp) ;
-   Release(&pHalfStep) ;
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcSoundSpeedForElems(RAJA::IndexSet *matElemList, Real_p ss,
-                            Real_p vnewc, Real_t rho0, Real_p enewc,
-                            Real_p pnewc, Real_p pbvc,
-                            Real_p bvc, Real_t ss4o3)
-{
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int iz) {
-      Real_t ssTmp = (pbvc[iz] * enewc[iz] + vnewc[iz] * vnewc[iz] *
-                 bvc[iz] * pnewc[iz]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp);
-      }
-      ss[iz] = ssTmp ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void EvalEOSForElems(Domain *domain, Real_p vnewc, Index_t numElem)
-{
-   Real_t  e_cut = domain->e_cut ;
-   Real_t  p_cut = domain->p_cut ;
-   Real_t  ss4o3 = domain->ss4o3 ;
-   Real_t  q_cut = domain->q_cut ;
-
-   Real_t eosvmax = domain->eosvmax ;
-   Real_t eosvmin = domain->eosvmin ;
-   Real_t pmin    = domain->pmin ;
-   Real_t emin    = domain->emin ;
-   Real_t rho0    = domain->refdens ;
-
-   /* allocate *domain length* arrays.  */
-   /* wastes memory, but allows us to get */
-   /* around a "temporary workset" issue */
-   /* we have not yet addressed. */
-   Real_p delvc = domain->delv ;
-   Real_p p_old = Allocate<Real_t>(numElem) ;
-   Real_p compression = Allocate<Real_t>(numElem) ;
-   Real_p compHalfStep = Allocate<Real_t>(numElem) ;
-   Real_p work = Allocate<Real_t>(numElem) ;
-   Real_p p_new = Allocate<Real_t>(numElem) ;
-   Real_p e_new = Allocate<Real_t>(numElem) ;
-   Real_p q_new = Allocate<Real_t>(numElem) ;
-   Real_p bvc = Allocate<Real_t>(numElem) ;
-   Real_p pbvc = Allocate<Real_t>(numElem) ;
-
-   /* compress data, minimal set */
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zidx) {
-      p_old[zidx] = domain->p[zidx] ;
-    }
-   ) ;
-
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zidx) {
-      Real_t vchalf ;
-      compression[zidx] = Real_t(1.) / vnewc[zidx] - Real_t(1.);
-      vchalf = vnewc[zidx] - delvc[zidx] * Real_t(.5);
-      compHalfStep[zidx] = Real_t(1.) / vchalf - Real_t(1.);
-    }
-   ) ;
-
-   /* Check for v > eosvmax or v < eosvmin */
-   if ( eosvmin != Real_t(0.) ) {
-      RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zidx) {
-         if (vnewc[zidx] <= eosvmin) { /* impossible due to calling func? */
-            compHalfStep[zidx] = compression[zidx] ;
-         }
-       }
-      ) ;
-   }
-   if ( eosvmax != Real_t(0.) ) {
-      RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zidx) {
-         if (vnewc[zidx] >= eosvmax) { /* impossible due to calling func? */
-            p_old[zidx]        = Real_t(0.) ;
-            compression[zidx]  = Real_t(0.) ;
-            compHalfStep[zidx] = Real_t(0.) ;
-         }
-       }
-      ) ;
-   }
-
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zidx) {
-      work[zidx] = Real_t(0.) ; 
-    }
-   ) ;
-
-   CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
-                 p_old, domain->e,  domain->q, compression, compHalfStep,
-                 vnewc, work,  delvc, pmin,
-                 p_cut, e_cut, q_cut, emin,
-                 domain->qq, domain->ql, rho0, eosvmax,
-                 domain->matElemList, numElem);
-
-
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zidx) {
-      domain->p[zidx] = p_new[zidx] ;
-      domain->e[zidx] = e_new[zidx] ;
-      domain->q[zidx] = q_new[zidx] ;
-    }
-   ) ;
-
-   CalcSoundSpeedForElems(domain->matElemList, domain->ss,
-             vnewc, rho0, e_new, p_new,
-             pbvc, bvc, ss4o3) ;
-
-   Release(&pbvc) ;
-   Release(&bvc) ;
-   Release(&q_new) ;
-   Release(&e_new) ;
-   Release(&p_new) ;
-   Release(&work) ;
-   Release(&compHalfStep) ;
-   Release(&compression) ;
-   Release(&p_old) ;
-}
-
-RAJA_STORAGE
-void ApplyMaterialPropertiesForElems(Domain *domain)
-{
-  Index_t numElem = domain->numElem ;
-
-  if (numElem != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = domain->eosvmin ;
-    Real_t eosvmax = domain->eosvmax ;
-
-    /* create a domain length (not material length) temporary */
-    /* we are assuming here that the number of dense ranges is */
-    /* much greater than the number of sigletons.  We are also */
-    /* assuming it is ok to allocate a domain length temporary */
-    /* rather than a material length temporary. */
-
-    Real_p vnewc = Allocate<Real_t>(numElem) ;
-
-    RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zn) {
-       vnewc[zn] = domain->vnew[zn] ;
-
-       if (eosvmin != Real_t(0.)) {
-          if (vnewc[zn] < eosvmin) {
-             vnewc[zn] = eosvmin ;
-          }
-       }
-
-       if (eosvmax != Real_t(0.)) {
-          if (vnewc[zn] > eosvmax) {
-             vnewc[zn] = eosvmax ;
-          }
-       }
-
-     }
-    ) ;
-
-    // check for negative element volume
-    RAJA::ReduceMin<reduce_policy, Real_t> minvol(1.0);
-
-    RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zn) {
-       Real_t vc = domain->v[zn] ;
-       if (eosvmin != Real_t(0.)) {
-          if (vc < eosvmin) {
-             vc = eosvmin ;
-          }
-       }
-       if (eosvmax != Real_t(0.)) {
-          if (vc > eosvmax) {
-             vc = eosvmax ;
-          }
-       }
-
-       minvol.min(vc);
-     }
-    ) ;
-
-    if ( Real_t(minvol) <= Real_t(0.) ) {
-       exit(VolumeError) ;
-    }
-
-    EvalEOSForElems(domain, vnewc, numElem);
-
-    Release(&vnewc) ;
-
-  }
-}
-
-RAJA_STORAGE
-void UpdateVolumesForElems(Real_p vnew, Real_p v,
-                           Real_t v_cut, Index_t length)
-{
-   if (length != 0) {
-      RAJA::forall<range_exec_policy>( int(0), int(length), [=] RAJA_DEVICE (int i) {
-         Real_t tmpV = vnew[i] ;
-
-         if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-            tmpV = Real_t(1.0) ;
-
-         v[i] = tmpV ;
-       }
-      ) ;
-   }
-
-   return ;
-}
-
-RAJA_STORAGE
-void LagrangeElements(Domain *domain, Index_t numElem)
-{
-  /* new relative volume -- temporary */
-  domain->vnew = Allocate<Real_t>(numElem) ;
-
-  CalcLagrangeElements(domain) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems(domain) ;
-
-  ApplyMaterialPropertiesForElems(domain) ;
-
-  UpdateVolumesForElems(domain->vnew, domain->v,
-                        domain->v_cut, numElem) ;
-
-  Release(&domain->vnew) ;
-}
-
-RAJA_STORAGE
-void CalcCourantConstraintForElems(RAJA::IndexSet *matElemList, Real_p ss,
-                                   Real_p vdov, Real_p arealg,
-                                   Real_t qqc, Real_t *dtcourant)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dtcourantLoc(Real_t(1.0e+20)) ;
-   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int indx) {
-      Real_t dtf = ss[indx] * ss[indx] ;
-      Real_t dtf_cmp ;
-
-      if ( vdov[indx] < Real_t(0.) ) {
-         dtf += qqc2 * arealg[indx] * arealg[indx] * vdov[indx] * vdov[indx] ;
-      }
-
-      dtf_cmp = (vdov[indx] != Real_t(0.))
-              ?  arealg[indx] / SQRT(dtf) : Real_t(1.0e+20) ;
-
-      /* determine minimum timestep with its corresponding elem */
-      dtcourantLoc.min(dtf_cmp) ;
-   } ) ;
-
-   /* Don't try to register a time constraint if none of the elements
-    * were active */
-   if (dtcourantLoc < Real_t(1.0e+20)) {
-      *dtcourant = dtcourantLoc ;
-   }
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcHydroConstraintForElems(RAJA::IndexSet *matElemList, Real_p vdov,
-                                 Real_t dvovmax, Real_t *dthydro)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dthydroLoc(Real_t(1.0e+20)) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int indx) {
-
-      Real_t dtvov_cmp = (vdov[indx] != Real_t(0.))
-                       ? (dvovmax / (FABS(vdov[indx])+Real_t(1.e-20)))
-                       : Real_t(1.0e+10) ;
-
-      dthydroLoc.min(dtvov_cmp) ;
-   } ) ;
-
-   if (dthydroLoc < Real_t(1.0e+20)) {
-      *dthydro = dthydroLoc ;
-   }
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcTimeConstraintsForElems(Domain *domain) {
-   /* evaluate time constraint */
-   /* normally,  this call is on a per region basis */
-   CalcCourantConstraintForElems(domain->matElemList, domain->ss,
-                                 domain->vdov, domain->arealg,
-                                 domain->qqc, &domain->dtcourant) ;
-
-   /* check hydro constraint */
-   CalcHydroConstraintForElems(domain->matElemList, domain->vdov,
-                               domain->dvovmax, &domain->dthydro) ;
-}
-
-RAJA_STORAGE
-void LagrangeLeapFrog(Domain *domain)
-{
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal(domain);
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements(domain, domain->numElem);
-
-   CalcTimeConstraintsForElems(domain);
-
-}
-
-int main(int argc, char *argv[])
-{
-
-   RAJA::Timer timer_main;
-   RAJA::Timer timer_cycle;
-
-   timer_main.start("timer_main");
-
-   Real_t tx, ty, tz ;
-   Index_t nidx, zidx ;
-   struct Domain domain ;
-   int maxIter = 1024*1024 ;
-
-   Index_t edgeElems = lulesh_edge_elems ;
-
-   for (int i=1; i<argc; ++i) {
-      if (strcmp(argv[i], "-p") == 0) {
-         show_run_progress = true ;
-      }
-      else if (strcmp(argv[i], "-i") == 0) {
-         if ((i+1 < argc) && isdigit(argv[i+1][0])) {
-            maxIter = atoi(argv[i+1]) ;
-            ++i;
-         }
-         else  {
-            printf("Iteration (-i) option has bad argument -- ignoring\n") ;
-         }
-      }
-      else if (strcmp(argv[i], "-s") == 0) {
-         if ((i+1 < argc) && isdigit(argv[i+1][0])) {
-            edgeElems = atoi(argv[i+1]) ;
-            ++i;
-         }
-         else  {
-            printf("Size (-s) option has bad argument -- ignoring\n") ;
-         }
-      }
-   }
-
-   Index_t edgeNodes = edgeElems+1 ;
-
-#ifdef RAJA_ENABLE_FT
-   /* mock up fault tolerance */
-   sigalrmact.sa_handler = simulate_fault ;
-   sigalrmact.sa_flags = 0 ;
-   sigemptyset(&sigalrmact.sa_mask) ;
-
-   printf("signal handler installed\n") ;
-   if (sigaction(SIGUSR2, &sigalrmact, NULL) < 0) {
-      perror("sigaction") ;
-      exit(2) ;
-   }
-#endif
-
-   /****************************/
-   /*  Print run parameters    */
-   /****************************/
-   printf("LULESH parallel run parameters:\n");
-   printf("\t stop time = %e\n", double(lulesh_stop_time)) ;
-   if ( lulesh_time_step > 0 ) {
-     printf("\t Fixed time step = %e\n", double(lulesh_time_step)) ;
-   } else {
-     printf("\t CFL-controlled: initial time step = %e\n", 
-            double(-lulesh_time_step)) ;
-   }
-   printf("\t Mesh size = %i x %i x %i\n", 
-          edgeElems, edgeElems, edgeElems) ;
-
-   switch (lulesh_tiling_mode) {
-      case Canonical:
-      { 
-         printf("\t Tiling mode is 'Canonical'\n");
-         break;
-      }
-      case Tiled_Index:
-      { 
-         printf("\t Tiling mode is 'Tiled_Index'\n");
-         break;
-      }
-      case Tiled_Order:
-      { 
-         printf("\t Tiling mode is 'Tiled_Order'\n");
-         break;
-      }
-      case Tiled_LockFree:
-      { 
-         printf("\t Tiling mode is 'Canonical'\n");
-         break;
-      }
-      default :
-      {
-         printf("Unknown tiling mode!!!\n");
-      }
-   }
-
-   if (lulesh_tiling_mode != Canonical) {
-      printf("\t Mesh tiling = %i x %i x %i\n",
-             lulesh_xtile, lulesh_ytile, lulesh_ztile) ;
-   }
-
-   /****************************/
-   /*   Initialize Sedov Mesh  */
-   /****************************/
-
-   /* construct a uniform box for this processor */
-
-   domain.sizeX = edgeElems ;
-   domain.sizeY = edgeElems ;
-   domain.sizeZ = edgeElems ;
-   domain.numElem = edgeElems*edgeElems*edgeElems ;
-
-   domain.numNode = edgeNodes*edgeNodes*edgeNodes ;
-
-   Index_t domElems = domain.numElem ;
-   Index_t domNodes = domain.numNode ;
-
-   /*************************/
-   /* allocate field memory */
-   /*************************/
-   
-   /*****************/
-   /* Elem-centered */
-   /*****************/
-
-   /* elemToNode connectivity */
-   domain.nodelist = Allocate<Index_t>(8*domElems) ;
-
-   /* elem connectivity through face */
-   domain.lxim = Allocate<Index_t>(domElems) ;
-   domain.lxip = Allocate<Index_t>(domElems)  ;
-   domain.letam = Allocate<Index_t>(domElems) ;
-   domain.letap = Allocate<Index_t>(domElems) ;
-   domain.lzetam = Allocate<Index_t>(domElems) ;
-   domain.lzetap = Allocate<Index_t>(domElems) ;
-
-   /* elem face symm/free-surface flag */
-   domain.elemBC = Allocate<Int_t>(domElems) ;
-
-   domain.e = Allocate<Real_t>(domElems) ;   /* energy */
-   domain.p = Allocate<Real_t>(domElems) ;   /* pressure */
-
-   domain.q = Allocate<Real_t>(domElems) ;   /* q */
-   domain.ql = Allocate<Real_t>(domElems) ;  /* linear term for q */
-   domain.qq = Allocate<Real_t>(domElems) ;  /* quadratic term for q */
-
-   domain.v = Allocate<Real_t>(domElems) ;     /* relative volume */
-   domain.volo = Allocate<Real_t>(domElems) ;  /* reference volume */
-   domain.delv = Allocate<Real_t>(domElems) ;  /* m_vnew - m_v */
-   domain.vdov = Allocate<Real_t>(domElems) ;  /* volume deriv over volume */
-
-   /* elem characteristic length */
-   domain.arealg = Allocate<Real_t>(domElems) ;
-
-   domain.ss = Allocate<Real_t>(domElems) ;    /* "sound speed" */
-
-   domain.elemMass = Allocate<Real_t>(domElems) ;  /* mass */
-
-   /*****************/
-   /* Node-centered */
-   /*****************/
-
-   domain.x = Allocate<Real_t>(domNodes) ;  /* coordinates */
-   domain.y = Allocate<Real_t>(domNodes)  ;
-   domain.z = Allocate<Real_t>(domNodes)  ;
-
-   domain.xd = Allocate<Real_t>(domNodes) ; /* velocities */
-   domain.yd = Allocate<Real_t>(domNodes)  ;
-   domain.zd = Allocate<Real_t>(domNodes) ;
-
-   domain.xdd = Allocate<Real_t>(domNodes)  ; /* accelerations */
-   domain.ydd = Allocate<Real_t>(domNodes)  ;
-   domain.zdd = Allocate<Real_t>(domNodes)  ;
-
-   domain.fx = Allocate<Real_t>(domNodes) ;  /* forces */
-   domain.fy = Allocate<Real_t>(domNodes) ;
-   domain.fz = Allocate<Real_t>(domNodes) ;
-
-   domain.nodalMass = Allocate<Real_t>(domNodes) ;  /* mass */
-
-   /* Boundary nodesets */
-
-   domain.symmX = Allocate<Index_t>(edgeNodes*edgeNodes) ;
-   domain.symmY = Allocate<Index_t>(edgeNodes*edgeNodes) ;
-   domain.symmZ = Allocate<Index_t>(edgeNodes*edgeNodes) ;
-
-   /* Basic Field Initialization */
-
-   for (Index_t i=0; i<domElems; ++i) {
-      domain.e[i] = Real_t(0.0) ;
-      domain.p[i] = Real_t(0.0) ;
-      domain.q[i] = Real_t(0.0) ;
-      domain.v[i] = Real_t(1.0) ;
-   }
-
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.xd[i] = Real_t(0.0) ;
-      domain.yd[i] = Real_t(0.0) ;
-      domain.zd[i] = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.xdd[i] = Real_t(0.0) ;
-      domain.ydd[i] = Real_t(0.0) ;
-      domain.zdd[i] = Real_t(0.0) ;
-   }
-
-   /* initialize nodal coordinates */
-
-   nidx = 0 ;
-   tz  = Real_t(0.) ;
-   for (Index_t plane=0; plane<edgeNodes; ++plane) {
-      ty = Real_t(0.) ;
-      for (Index_t row=0; row<edgeNodes; ++row) {
-         tx = Real_t(0.) ;
-         for (Index_t col=0; col<edgeNodes; ++col) {
-            domain.x[nidx] = tx ;
-            domain.y[nidx] = ty ;
-            domain.z[nidx] = tz ;
-            ++nidx ;
-            // tx += ds ; /* may accumulate roundoff... */
-            tx = Real_t(1.125)*Real_t(col+1)/Real_t(edgeElems) ;
-         }
-         // ty += ds ;  /* may accumulate roundoff... */
-         ty = Real_t(1.125)*Real_t(row+1)/Real_t(edgeElems) ;
-      }
-      // tz += ds ;  /* may accumulate roundoff... */
-      tz = Real_t(1.125)*Real_t(plane+1)/Real_t(edgeElems) ;
-   }
-
-
-   /* embed hexehedral elements in nodal point lattice */
-
-   nidx = 0 ;
-   zidx = 0 ;
-   for (Index_t plane=0; plane<edgeElems; ++plane) {
-      for (Index_t row=0; row<edgeElems; ++row) {
-         for (Index_t col=0; col<edgeElems; ++col) {
-            Index_p localNode = &domain.nodelist[8*zidx] ;
-            localNode[0] = nidx                                       ;
-            localNode[1] = nidx                                   + 1 ;
-            localNode[2] = nidx                       + edgeNodes + 1 ;
-            localNode[3] = nidx                       + edgeNodes     ;
-            localNode[4] = nidx + edgeNodes*edgeNodes                 ;
-            localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
-            localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
-            localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
-            ++zidx ;
-            ++nidx ;
-         }
-         ++nidx ;
-      }
-      nidx += edgeNodes ;
-   }
-
-   /* initialize material parameters */
-   domain.dtfixed = Real_t(lulesh_time_step) ;
-   domain.deltatime = Real_t(1.0e-7) ;
-   domain.deltatimemultlb = Real_t(1.1) ;
-   domain.deltatimemultub = Real_t(1.2) ;
-   domain.stoptime  = Real_t(lulesh_stop_time) ;
-   domain.dtcourant = Real_t(1.0e+20) ;
-   domain.dthydro   = Real_t(1.0e+20) ;
-   domain.dtmax     = Real_t(1.0e-2) ;
-   domain.time    = Real_t(0.) ;
-   domain.cycle   = 0 ;
-
-   domain.e_cut = Real_t(1.0e-7) ;
-   domain.p_cut = Real_t(1.0e-7) ;
-   domain.q_cut = Real_t(1.0e-7) ;
-   domain.u_cut = Real_t(1.0e-7) ;
-   domain.v_cut = Real_t(1.0e-10) ;
-
-   domain.hgcoef      = Real_t(3.0) ;
-   domain.ss4o3       = Real_t(4.0)/Real_t(3.0) ;
-
-   domain.qstop              =  Real_t(1.0e+12) ;
-   domain.monoq_max_slope    =  Real_t(1.0) ;
-   domain.monoq_limiter_mult =  Real_t(2.0) ;
-   domain.qlc_monoq          = Real_t(0.5) ;
-   domain.qqc_monoq          = Real_t(2.0)/Real_t(3.0) ;
-   domain.qqc                = Real_t(2.0) ;
-
-   domain.pmin =  Real_t(0.) ;
-   domain.emin = Real_t(-1.0e+15) ;
-
-   domain.dvovmax =  Real_t(0.1) ;
-
-   domain.eosvmax =  Real_t(1.0e+9) ;
-   domain.eosvmin =  Real_t(1.0e-9) ;
-
-   domain.refdens =  Real_t(1.0) ;
-
-   /* initialize field data */
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.nodalMass[i] = 0.0 ;
-   }
-
-   for (Index_t i=0; i<domElems; ++i) {
-      Real_t x_local[8], y_local[8], z_local[8] ;
-      Index_p elemToNode = &domain.nodelist[8*i] ;
-      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-      {
-        Index_t gnode = elemToNode[lnode];
-        x_local[lnode] = domain.x[gnode];
-        y_local[lnode] = domain.y[gnode];
-        z_local[lnode] = domain.z[gnode];
-      }
-
-      // volume calculations
-      Real_t volume = CalcElemVolume(x_local, y_local, z_local );
-      domain.volo[i] = volume ;
-      domain.elemMass[i] = volume ;
-      for (Index_t j=0; j<8; ++j) {
-         Index_t idx = elemToNode[j] ;
-         domain.nodalMass[idx] += volume / Real_t(8.0) ;
-      }
-   }
-
-   /* deposit energy */
-   domain.e[0] = Real_t(3.948746e+7) ;
-
-   /* set up symmetry nodesets */
-   nidx = 0 ;
-   for (Index_t i=0; i<edgeNodes; ++i) {
-      Index_t planeInc = i*edgeNodes*edgeNodes ;
-      Index_t rowInc   = i*edgeNodes ;
-      for (Index_t j=0; j<edgeNodes; ++j) {
-         domain.symmX[nidx] = planeInc + j*edgeNodes ;
-         domain.symmY[nidx] = planeInc + j ;
-         domain.symmZ[nidx] = rowInc   + j ;
-         ++nidx ;
-      }
-   }
-
-   /* set up elemement connectivity information */
-   domain.lxim[0] = 0 ;
-   for (Index_t i=1; i<domElems; ++i) {
-      domain.lxim[i]   = i-1 ;
-      domain.lxip[i-1] = i ;
-   }
-   domain.lxip[domElems-1] = domElems-1 ;
-
-   for (Index_t i=0; i<edgeElems; ++i) {
-      domain.letam[i] = i ; 
-      domain.letap[domElems-edgeElems+i] = domElems-edgeElems+i ;
-   }
-   for (Index_t i=edgeElems; i<domElems; ++i) {
-      domain.letam[i] = i-edgeElems ;
-      domain.letap[i-edgeElems] = i ;
-   }
-
-   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
-      domain.lzetam[i] = i ;
-      domain.lzetap[domElems-edgeElems*edgeElems+i] = domElems-edgeElems*edgeElems+i ;
-   }
-   for (Index_t i=edgeElems*edgeElems; i<domElems; ++i) {
-      domain.lzetam[i] = i - edgeElems*edgeElems ;
-      domain.lzetap[i-edgeElems*edgeElems] = i ;
-   }
-
-   /* set up boundary condition information */
-   for (Index_t i=0; i<domElems; ++i) {
-      domain.elemBC[i] = 0 ;  /* clear BCs by default */
-   }
-
-   /* faces on "external" boundaries will be */
-   /* symmetry plane or free surface BCs */
-   for (Index_t i=0; i<edgeElems; ++i) {
-      Index_t planeInc = i*edgeElems*edgeElems ;
-      Index_t rowInc   = i*edgeElems ;
-      for (Index_t j=0; j<edgeElems; ++j) {
-         domain.elemBC[planeInc+j*edgeElems] |= XI_M_SYMM ;
-         domain.elemBC[planeInc+j*edgeElems+edgeElems-1] |= XI_P_FREE ;
-         domain.elemBC[planeInc+j] |= ETA_M_SYMM ;
-         domain.elemBC[planeInc+j+edgeElems*edgeElems-edgeElems] |= ETA_P_FREE ;
-         domain.elemBC[rowInc+j] |= ZETA_M_SYMM ;
-         domain.elemBC[rowInc+j+domElems-edgeElems*edgeElems] |= ZETA_P_FREE ;
-      }
-   }
-
-   /* Create domain IndexSets */
-
-   /* always leave the nodes in a canonical ordering */
-   domain.domNodeList = new RAJA::IndexSet() ;
-   domain.domNodeList->push_back( RAJA::RangeSegment(0, domNodes) );
-
-   domain.domElemList = new RAJA::IndexSet() ;
-   domain.matElemList = new RAJA::IndexSet() ;
-
-   const Index_t xtile = lulesh_xtile ;
-   const Index_t ytile = lulesh_ytile ;
-   const Index_t ztile = lulesh_ztile ;
-
-   if ( lulesh_tiling_mode == Tiled_LockFree ) {
-      printf("Tiled_LockFree ordering not implemented!!! Canonical will be used.\n");
-      lulesh_tiling_mode = Canonical;
-   }
-
-   switch (lulesh_tiling_mode) {
-
-      case Canonical:
-      {
-         domain.domElemList->push_back( RAJA::RangeSegment(0, domElems) );
-
-         /* Create a material IndexSet (entire domain same material for now) */
-         domain.matElemList->push_back( RAJA::RangeSegment(0, domElems) );
-      }
-      break ;
-
-      case Tiled_Index:
-      {
-         for (Index_t zt = 0; zt < ztile; ++zt) {
-            for (Index_t yt = 0; yt < ytile; ++yt) {
-               for (Index_t xt = 0; xt < xtile; ++xt) {
-                  Index_t xbegin =  edgeElems*( xt )/xtile ;
-                  Index_t xend   =  edgeElems*(xt+1)/xtile ;
-                  Index_t ybegin =  edgeElems*( yt )/ytile ;
-                  Index_t yend   =  edgeElems*(yt+1)/ytile ;
-                  Index_t zbegin =  edgeElems*( zt )/ztile ;
-                  Index_t zend   =  edgeElems*(zt+1)/ztile ;
-                  Index_t tileSize = 
-                     (xend - xbegin)*(yend-ybegin)*(zend-zbegin) ;
-                  Index_t tileIdx[tileSize] ;
-                  Index_t idx = 0 ;
-
-                  for (Index_t plane = zbegin; plane<zend; ++plane) {
-                     for (Index_t row = ybegin; row<yend; ++row) {
-                        for (Index_t col = xbegin; col<xend; ++col) {
-                           tileIdx[idx++] = 
-                              (plane*edgeElems + row)*edgeElems + col ;
-                        }
-                     }
-                  }
-                  domain.domElemList->push_back( RAJA::ListSegment(tileIdx, tileSize) );
-                  domain.matElemList->push_back( RAJA::ListSegment(tileIdx, tileSize) );
-               }
-            }
-         }
-      }
-      break ;
-
-      case Tiled_Order:
-      {
-         Index_t idx = 0 ;
-         Index_t perm[domElems] ;
-         Index_t iperm[domElems] ; /* inverse permutation */
-         Index_t tileBegin = 0 ;
-         for (Index_t zt = 0; zt < ztile; ++zt) {
-            for (Index_t yt = 0; yt < ytile; ++yt) {
-               for (Index_t xt = 0; xt < xtile; ++xt) {
-                  Index_t xbegin =  edgeElems*( xt )/xtile ;
-                  Index_t xend   =  edgeElems*(xt+1)/xtile ;
-                  Index_t ybegin =  edgeElems*( yt )/ytile ;
-                  Index_t yend   =  edgeElems*(yt+1)/ytile ;
-                  Index_t zbegin =  edgeElems*( zt )/ztile ;
-                  Index_t zend   =  edgeElems*(zt+1)/ztile ;
-                  Index_t tileSize = 
-                     (xend - xbegin)*(yend-ybegin)*(zend-zbegin) ;
-
-                  for (Index_t plane = zbegin; plane<zend; ++plane) {
-                     for (Index_t row = ybegin; row<yend; ++row) {
-                        for (Index_t col = xbegin; col<xend; ++col) {
-                           perm[idx] = 
-                              (plane*edgeElems + row)*edgeElems + col ;
-                           iperm[perm[idx]] = idx ;
-                           ++idx ;
-                        }
-                     }
-                  }
-                  Index_t tileEnd = tileBegin + tileSize ;
-                  domain.domElemList->push_back( RAJA::RangeSegment(tileBegin, tileEnd) );
-                  domain.matElemList->push_back( RAJA::RangeSegment(tileBegin, tileEnd) );
-                  tileBegin = tileEnd ;
-               }
-            }
-         }
-         /* permute nodelist connectivity */
-         {
-            Index_t tmp[8*domElems] ;
-            for (Index_t i=0; i<domElems; ++i) {
-               for (Index_t j=0; j<8; ++j) {
-                  tmp[i*8+j] = domain.nodelist[perm[i]*8+j] ;
-               }
-            }
-            for (Index_t i=0; i<8*domElems; ++i) {
-               domain.nodelist[i] = tmp[i] ;
-            }
-         }
-         /* permute volo */
-         {
-            Real_t tmp[domElems] ;
-            for (Index_t i=0; i<domElems; ++i) {
-               tmp[i] = domain.volo[perm[i]] ;
-            }
-            for (Index_t i=0; i<domElems; ++i) {
-               domain.volo[i] = tmp[i] ;
-            }
-         }
-         /* permute elemMass */
-         {
-            Real_t tmp[domElems] ;
-            for (Index_t i=0; i<domElems; ++i) {
-               tmp[i] = domain.elemMass[perm[i]] ;
-            }
-            for (Index_t i=0; i<domElems; ++i) {
-               domain.elemMass[i] = tmp[i] ;
-            }
-         }
-         /* permute lxim, lxip, letam, letap, lzetam, lzetap */
-         {
-            Index_t tmp[6*domElems] ;
-            for (Index_t i=0; i<domElems; ++i) {
-               tmp[i*6+0] = iperm[domain.lxim[perm[i]]] ;
-               tmp[i*6+1] = iperm[domain.lxip[perm[i]]] ;
-               tmp[i*6+2] = iperm[domain.letam[perm[i]]] ;
-               tmp[i*6+3] = iperm[domain.letap[perm[i]]] ;
-               tmp[i*6+4] = iperm[domain.lzetam[perm[i]]] ;
-               tmp[i*6+5] = iperm[domain.lzetap[perm[i]]] ;
-            }
-            for (Index_t i=0; i<domElems; ++i) {
-               domain.lxim[i] = tmp[i*6+0] ;
-               domain.lxip[i] = tmp[i*6+1] ;
-               domain.letam[i] = tmp[i*6+2] ;
-               domain.letap[i] = tmp[i*6+3] ;
-               domain.lzetam[i] = tmp[i*6+4] ;
-               domain.lzetap[i] = tmp[i*6+5] ;
-            }
-         }
-         /* permute elemBC */
-         {
-            Int_t tmp[domElems] ;
-            for (Index_t i=0; i<domElems; ++i) {
-               tmp[i] = domain.elemBC[perm[i]] ;
-            }
-            for (Index_t i=0; i<domElems; ++i) {
-               domain.elemBC[i] = tmp[i] ;
-            }
-         }
-      }
-      break ;
-
-      case Tiled_LockFree:
-      {
-         // NOT IMPLEMENTED!!!
-      }
-      break;
-
-      default :
-      {
-         printf("Unknown index set ordering!!! Left undefined.\n");
-      }
-   }
-
-   // OMP Hack
-   // set up node-centered indexing of elements
-   Index_p nodeElemCount = Allocate<Index_t>(domNodes) ;
-
-   for (Index_t i=0; i<domNodes; ++i) {
-     nodeElemCount[i] = 0 ;
-   }
-
-   for (Index_t i=0; i<domElems; ++i) {
-     Index_p nl = &domain.nodelist[8*i] ;
-     for (Index_t j=0; j < 8; ++j) {
-       ++(nodeElemCount[nl[j]] );
-     }
-   }
-
-   domain.nodeElemStart = Allocate<Index_t>(domNodes+1) ;
-
-   domain.nodeElemStart[0] = 0;
-
-   for (Index_t i=1; i <= domNodes; ++i) {
-     domain.nodeElemStart[i] =
-       domain.nodeElemStart[i-1] + nodeElemCount[i-1] ;
-   }
-
-   domain.nodeElemCornerList =
-      Allocate<Index_t>(domain.nodeElemStart[domNodes]);
-
-   for (Index_t i=0; i < domNodes; ++i) {
-     nodeElemCount[i] = 0;
-   }
-
-   for (Index_t i=0; i < domElems; ++i) {
-     Index_p nl = &domain.nodelist[8*i] ;
-     for (Index_t j=0; j < 8; ++j) {
-       Index_t m = nl[j];
-       Index_t k = i*8 + j ;
-       Index_t offset = domain.nodeElemStart[m] + nodeElemCount[m] ;
-       domain.nodeElemCornerList[offset] = k;
-       ++(nodeElemCount[m]) ;
-     }
-   }
-
-#ifdef DEBUG_LULESH
-   Index_t clSize = domain.nodeElemStart[domNodes] ;
-   for (Index_t i=0; i < clSize; ++i) {
-     Index_t clv = domain.nodeElemCornerList[i] ;
-     if ((clv < 0) || (clv > domElems*8)) {
-       fprintf(stderr,
-        "AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n");
-       exit(-1);
-     }
-   }
-#endif
-
-   Release(&nodeElemCount) ;
-
-   /* Fault Tolerance begins here */
-
-   /* timestep to solution */
-   timer_cycle.start("timer_cycle");
-   while((domain.time < domain.stoptime) && (domain.cycle < maxIter)) {
-      TimeIncrement(&domain) ;
-      LagrangeLeapFrog(&domain) ;
-      /* problem->commNodes->Transfer(CommNodes::syncposvel) ; */
-      if ( show_run_progress ) {
-         printf("cycle = %d, time = %e, dt=%e\n",
-                domain.cycle,double(domain.time), double(domain.deltatime) ) ;
-      }
-   }
-   timer_cycle.stop("timer_cycle");
-
-   timer_main.stop("timer_main");
-
-   printf("Total Cycle Time (sec) = %Lf\n", timer_cycle.elapsed() );
-   printf("Total main Time (sec) = %Lf\n", timer_main.elapsed() );
-
-   return 0 ;
-}
diff --git a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-parallel.cxx b/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-parallel.cxx
deleted file mode 100644
index 7b9c6207b..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-parallel.cxx
+++ /dev/null
@@ -1,3484 +0,0 @@
-/*
-
-                 Copyright (c) 2010.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 1.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <cctype>
-
-#include "RAJA/RAJA.hxx"
-
-#include "RAJA/IndexSetBuilders.hxx"
-
-#include "Timer.hxx"
-
-
-/*
- ***********************************************
- * Set parameters that define how code will run.
- ***********************************************
- */
-
-//
-// Display simulation time and timestep during run.
-//
-bool show_run_progress = false;
-
-//
-// Set stop time and time increment for run.
-//
-// The absolute value of lulesh_time_step sets the first time step increment.
-//   - If < 0, the CFL condition will be used to determine subsequent time
-//     step sizes (with some upper bound on the amount the timestep can grow).
-//   - If > 0, the time step will be fixed for the entire run.
-//
-const double lulesh_stop_time = 1.0e-2;
-const double lulesh_time_step = -1.0e-7;
-
-//
-// Set mesh size (physical domain size is fixed).
-//
-// Mesh will be lulesh_edge_elems^3.
-//
-const int lulesh_edge_elems = 45;
-
-//
-// Set number of tiles in each mesh direction for non-canonical oerderings.
-//
-const int lulesh_xtile = 2;
-const int lulesh_ytile = 2;
-const int lulesh_ztile = 2;
-
-// ########################################################
-// Execution policies for loop patterns in LULESH are
-// defined in this header file. Set USE_CASE for desired
-// execution.
-// ########################################################
-#include "luleshPolicy.hxx"
-
-
-//
-// use RAJA data types for loop operations using RAJA
-//
-typedef RAJA::Index_type  Index_t ; /* array subscript and loop index */
-typedef RAJA::Real_type   Real_t ;  /* floating point representation */
-typedef RAJA::Real_ptr    Real_p;
-typedef RAJA::const_Real_ptr    const_Real_p;
-typedef RAJA::Index_type* Index_p;
-
-//#define RAJA_STORAGE static inline
-#define RAJA_STORAGE
-
-
-/****************************************************/
-/*                                                  */
-/* Allow flexibility for arithmetic representations */
-/*                                                  */
-/* Think about how to make this consistent w/RAJA   */
-/* type parameterization (above)!!                  */
-/*                                                  */
-/****************************************************/
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-/* Could also support fixed point and interval arithmetic types */
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  /* 10 bytes on x86 */
-
-typedef int    Int_t ;   /* integer representation */
-
-inline RAJA_DEVICE
-real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline RAJA_DEVICE
-real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline RAJA_DEVICE
-real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline RAJA_DEVICE
-real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline RAJA_DEVICE
-real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline RAJA_DEVICE
-real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline RAJA_DEVICE
-real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline RAJA_DEVICE
-real8  FABS(real8  arg) { return fabs(arg) ; }
-inline RAJA_DEVICE
-real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-// ########################################################
-//  Memory allocate/release routines
-// ########################################################
-#include "luleshMemory.hxx"
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-/* Manage temporary allocations with a pool */
-RAJA::MemoryPool< Real_t > elemMemPool ;
-
-/***********************************/
-/* Domain structure implementation */
-/***********************************/
-
-/* might want to add access methods so that memory can be */
-/* better managed, as in luleshFT */
-
-struct Domain {
-   /* Elem-centered */
-
-   RAJA::IndexSet *domElemList ;   /* elem indexset */
-   RAJA::IndexSet *matElemList ;   /* material indexset */
-   Index_p nodelist ;     /* elemToNode connectivity */
-
-   Index_p lxim ;         /* elem connectivity through face */
-   Index_p lxip ;
-   Index_p letam ;
-   Index_p letap ;
-   Index_p lzetam ;
-   Index_p lzetap ;
-
-   Int_t *elemBC ;         /* elem face symm/free-surface flag */
-
-   Real_p e ;             /* energy */
-
-   Real_p p ;             /* pressure */
-
-   Real_p q ;             /* q */
-   Real_p ql ;            /* linear term for q */
-   Real_p qq ;            /* quadratic term for q */
-
-   Real_p v ;             /* relative volume */
-
-   Real_p volo ;          /* reference volume */
-   Real_p delv ;          /* m_vnew - m_v */
-   Real_p vdov ;          /* volume derivative over volume */
-
-   Real_p arealg ;        /* elem characteristic length */
-
-   Real_p ss ;            /* "sound speed" */
-
-   Real_p elemMass ;      /* mass */
-
-   /* Elem temporaries */
-
-   Real_p vnew ;          /* new relative volume -- temporary */
-
-   Real_p delv_xi ;       /* velocity gradient -- temporary */
-   Real_p delv_eta ;
-   Real_p delv_zeta ;
-
-   Real_p delx_xi ;       /* position gradient -- temporary */
-   Real_p delx_eta ;
-   Real_p delx_zeta ;
-
-   Real_p dxx ;          /* principal strains -- temporary */
-   Real_p dyy ;
-   Real_p dzz ;
-
-   /* Node-centered */
-
-   RAJA::IndexSet *domNodeList ;   /* node indexset */
-
-   /* boundary nodesets */
-   RAJA::IndexSet *symmX ;        /* Nodes on X symmetry plane */
-   RAJA::IndexSet *symmY ;        /* Nodes on Y symmetry plane */
-   RAJA::IndexSet *symmZ ;        /* Nodes on Z symmetry plane */
-
-
-   Real_p x ;             /* coordinates */
-   Real_p y ;
-   Real_p z ;
-
-   Real_p xd ;            /* velocities */
-   Real_p yd ;
-   Real_p zd ;
-
-   Real_p xdd ;           /* accelerations */
-   Real_p ydd ;
-   Real_p zdd ;
-
-   Real_p fx ;            /* forces */
-   Real_p fy ;
-   Real_p fz ;
-
-   Real_p nodalMass ;     /* mass */
-
-#if defined(OMP_FINE_SYNC)
-   Index_p nodeElemStart ;
-   Index_p nodeElemCornerList ;
-#endif
-
-   /* Parameters */
-
-   Real_t  dtfixed ;           /* fixed time increment */
-   Real_t  time ;              /* current time */
-   Real_t  deltatime ;         /* variable time increment */
-   Real_t  deltatimemultlb ;
-   Real_t  deltatimemultub ;
-   Real_t  stoptime ;          /* end time for simulation */
-
-   Real_t  u_cut ;             /* velocity tolerance */
-   Real_t  hgcoef ;            /* hourglass control */
-   Real_t  qstop ;             /* excessive q indicator */
-   Real_t  monoq_max_slope ;
-   Real_t  monoq_limiter_mult ;
-   Real_t  e_cut ;             /* energy tolerance */
-   Real_t  p_cut ;             /* pressure tolerance */
-   Real_t  ss4o3 ;
-   Real_t  q_cut ;             /* q tolerance */
-   Real_t  v_cut ;             /* relative volume tolerance */
-   Real_t  qlc_monoq ;         /* linear term coef for q */
-   Real_t  qqc_monoq ;         /* quadratic term coef for q */
-   Real_t  qqc ;
-   Real_t  eosvmax ;
-   Real_t  eosvmin ;
-   Real_t  pmin ;              /* pressure floor */
-   Real_t  emin ;              /* energy floor */
-   Real_t  dvovmax ;           /* maximum allowable volume change */
-   Real_t  refdens ;           /* reference density */
-
-   Real_t  dtcourant ;         /* courant constraint */
-   Real_t  dthydro ;           /* volume change constraint */
-   Real_t  dtmax ;             /* maximum allowable time increment */
-
-   Int_t   cycle ;             /* iteration count for simulation */
-
-   Index_t sizeX ;
-   Index_t sizeY ;
-   Index_t sizeZ ;
-   Index_t numElem ;
-
-   Index_t numNode ;
-
-   Index_t idx ;
-} ;
-
-
-/* Stuff needed for boundary conditions */
-/* 2 BCs on each of 6 hexahedral faces (12 bits) */
-#define XI_M        0x003
-#define XI_M_SYMM   0x001
-#define XI_M_FREE   0x002
-
-#define XI_P        0x00c
-#define XI_P_SYMM   0x004
-#define XI_P_FREE   0x008
-
-#define ETA_M       0x030
-#define ETA_M_SYMM  0x010
-#define ETA_M_FREE  0x020
-
-#define ETA_P       0x0c0
-#define ETA_P_SYMM  0x040
-#define ETA_P_FREE  0x080
-
-#define ZETA_M      0x300
-#define ZETA_M_SYMM 0x100
-#define ZETA_M_FREE 0x200
-
-#define ZETA_P      0xc00
-#define ZETA_P_SYMM 0x400
-#define ZETA_P_FREE 0x800
-
-
-//--------------------------------------------------------------------------
-
-RAJA_STORAGE
-void TimeIncrement(Domain *domain)
-{
-   Real_t targetdt = domain->stoptime - domain->time ;
-
-   if ((domain->dtfixed <= Real_t(0.0)) && (domain->cycle != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = domain->deltatime ;
-
-      /* This will require a reduction in parallel */
-      Real_t newdt = Real_t(1.0e+20) ;
-      if (domain->dtcourant < newdt) {
-         newdt = domain->dtcourant / Real_t(2.0) ;
-      }
-      if (domain->dthydro < newdt) {
-         newdt = domain->dthydro * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < domain->deltatimemultlb) {
-            newdt = olddt ;
-         }
-         else if (ratio > domain->deltatimemultub) {
-            newdt = olddt*domain->deltatimemultub ;
-         }
-      }
-
-      if (newdt > domain->dtmax) {
-         newdt = domain->dtmax ;
-      }
-      domain->deltatime = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > domain->deltatime) &&
-       (targetdt < (Real_t(4.0) * domain->deltatime / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * domain->deltatime / Real_t(3.0) ;
-   }
-
-   if (targetdt < domain->deltatime) {
-      domain->deltatime = targetdt ;
-   }
-
-   domain->time += domain->deltatime ;
-
-   ++domain->cycle ;
-}
-
-RAJA_STORAGE
-void InitStressTermsForElems(Real_p p, Real_p q,
-                             Real_p sigxx, Real_p sigyy, Real_p sigzz,
-                             RAJA::IndexSet *domElemList)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-
-   RAJA::forall<elem_exec_policy>(*domElemList, [=] RAJA_DEVICE (int idx) {
-      sigxx[idx] = sigyy[idx] = sigzz[idx] =  - p[idx] - q[idx] ;
-     }
-   ) ;
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CalcElemShapeFunctionDerivatives( const_Real_p x,
-                                       const_Real_p y,
-                                       const_Real_p z,
-                                       Real_t b[][8],
-                                       Real_t* const volume
-                                     )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CalcElemNodeNormals(
-                         Real_p pfx,
-                         Real_p pfy,
-                         Real_p pfz,
-                         const_Real_p x,
-                         const_Real_p y,
-                         const_Real_p z
-                        )
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_p fx, Real_p fy, Real_p fz
-                                )
-{
-  Real_t pfx0 = B[0][0] ;   Real_t pfx1 = B[0][1] ;
-  Real_t pfx2 = B[0][2] ;   Real_t pfx3 = B[0][3] ;
-  Real_t pfx4 = B[0][4] ;   Real_t pfx5 = B[0][5] ;
-  Real_t pfx6 = B[0][6] ;   Real_t pfx7 = B[0][7] ;
-
-  Real_t pfy0 = B[1][0] ;   Real_t pfy1 = B[1][1] ;
-  Real_t pfy2 = B[1][2] ;   Real_t pfy3 = B[1][3] ;
-  Real_t pfy4 = B[1][4] ;   Real_t pfy5 = B[1][5] ;
-  Real_t pfy6 = B[1][6] ;   Real_t pfy7 = B[1][7] ;
-
-  Real_t pfz0 = B[2][0] ;   Real_t pfz1 = B[2][1] ;
-  Real_t pfz2 = B[2][2] ;   Real_t pfz3 = B[2][3] ;
-  Real_t pfz4 = B[2][4] ;   Real_t pfz5 = B[2][5] ;
-  Real_t pfz6 = B[2][6] ;   Real_t pfz7 = B[2][7] ;
-
-  fx[0] = -( stress_xx * pfx0 );
-  fx[1] = -( stress_xx * pfx1 );
-  fx[2] = -( stress_xx * pfx2 );
-  fx[3] = -( stress_xx * pfx3 );
-  fx[4] = -( stress_xx * pfx4 );
-  fx[5] = -( stress_xx * pfx5 );
-  fx[6] = -( stress_xx * pfx6 );
-  fx[7] = -( stress_xx * pfx7 );
-
-  fy[0] = -( stress_yy * pfy0  );
-  fy[1] = -( stress_yy * pfy1  );
-  fy[2] = -( stress_yy * pfy2  );
-  fy[3] = -( stress_yy * pfy3  );
-  fy[4] = -( stress_yy * pfy4  );
-  fy[5] = -( stress_yy * pfy5  );
-  fy[6] = -( stress_yy * pfy6  );
-  fy[7] = -( stress_yy * pfy7  );
-
-  fz[0] = -( stress_zz * pfz0 );
-  fz[1] = -( stress_zz * pfz1 );
-  fz[2] = -( stress_zz * pfz2 );
-  fz[3] = -( stress_zz * pfz3 );
-  fz[4] = -( stress_zz * pfz4 );
-  fz[5] = -( stress_zz * pfz5 );
-  fz[6] = -( stress_zz * pfz6 );
-  fz[7] = -( stress_zz * pfz7 );
-}
-
-RAJA_STORAGE
-void IntegrateStressForElems( Index_p nodelist,
-                              Real_p x,  Real_p y,  Real_p z,
-                              Real_p fx, Real_p fy, Real_p fz,
-                              Real_p sigxx, Real_p sigyy, Real_p sigzz,
-                              Real_p determ,
-                              RAJA::IndexSet *domElemList,
-                              RAJA::IndexSet *domNodeList
-#if defined(OMP_FINE_SYNC)
-                             ,Index_t numElem, Index_p nodeElemStart,
-                              Index_p nodeElemCornerList
-#endif
-                            )
-{
-#if defined(OMP_FINE_SYNC)
-  Real_p fx_elem = elemMemPool.allocate(numElem*8) ;
-  Real_p fy_elem = elemMemPool.allocate(numElem*8) ;
-  Real_p fz_elem = elemMemPool.allocate(numElem*8) ;
-#endif
-
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(*domElemList, [=] RAJA_DEVICE (int k) {
-    Real_t B[3][8] __attribute__((aligned(32))) ;// shape function derivatives
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-#if !defined(OMP_FINE_SYNC)
-    Real_t fx_local[8] ;
-    Real_t fy_local[8] ;
-    Real_t fz_local[8] ;
-#endif
-
-    const Index_p elemNodes = &nodelist[8*k];
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      x_local[lnode] = x[gnode];
-      y_local[lnode] = y[gnode];
-      z_local[lnode] = z[gnode];
-    }
-
-    /* Volume calculation involves extra work for numerical consistency. */
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                     B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                         x_local, y_local, z_local );
-
-    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-#if !defined(OMP_FINE_SYNC)
-                                 fx_local, fy_local, fz_local
-#else
-                                 &fx_elem[k*8], &fy_elem[k*8], &fz_elem[k*8]
-#endif
-                               ) ;
-
-#if !defined(OMP_FINE_SYNC)
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      fx[gnode] += fx_local[lnode] ;
-      fy[gnode] += fy_local[lnode] ;
-      fz[gnode] += fz_local[lnode] ;
-    }
-#endif
-   }
-  ) ;
-
-#if defined(OMP_FINE_SYNC)
-  RAJA::forall<node_exec_policy>(*domNodeList, [=] RAJA_DEVICE (int gnode) {
-     Index_t count = nodeElemStart[gnode+1] - nodeElemStart[gnode] ;
-     Index_t *cornerList = &nodeElemCornerList[nodeElemStart[gnode]] ;
-     Real_t fx_sum = Real_t(0.0) ;
-     Real_t fy_sum = Real_t(0.0) ;
-     Real_t fz_sum = Real_t(0.0) ;
-     for (Index_t i=0 ; i < count ; ++i) {
-        Index_t elem = cornerList[i] ;
-        fx_sum += fx_elem[elem] ;
-        fy_sum += fy_elem[elem] ;
-        fz_sum += fz_elem[elem] ;
-     }
-     fx[gnode] = fx_sum ;
-     fy[gnode] = fy_sum ;
-     fz[gnode] = fz_sum ;
-   }
-  ) ;
-
-  elemMemPool.release(&fz_elem) ;
-  elemMemPool.release(&fy_elem) ;
-  elemMemPool.release(&fx_elem) ;
-#endif
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CollectDomainNodesToElemNodes(Real_p x, Real_p y, Real_p z,
-                                   Index_p elemToNode,
-                                   Real_p elemX,
-                                   Real_p elemY,
-                                   Real_p elemZ
-                                  )
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = x[nd0i];
-   elemX[1] = x[nd1i];
-   elemX[2] = x[nd2i];
-   elemX[3] = x[nd3i];
-   elemX[4] = x[nd4i];
-   elemX[5] = x[nd5i];
-   elemX[6] = x[nd6i];
-   elemX[7] = x[nd7i];
-
-   elemY[0] = y[nd0i];
-   elemY[1] = y[nd1i];
-   elemY[2] = y[nd2i];
-   elemY[3] = y[nd3i];
-   elemY[4] = y[nd4i];
-   elemY[5] = y[nd5i];
-   elemY[6] = y[nd6i];
-   elemY[7] = y[nd7i];
-
-   elemZ[0] = z[nd0i];
-   elemZ[1] = z[nd1i];
-   elemZ[2] = z[nd2i];
-   elemZ[3] = z[nd3i];
-   elemZ[4] = z[nd4i];
-   elemZ[5] = z[nd5i];
-   elemZ[6] = z[nd6i];
-   elemZ[7] = z[nd7i];
-
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CalcElemVolumeDerivative(
-                              Real_p dvdx,
-                              Real_p dvdy,
-                              Real_p dvdz,
-                              const_Real_p x,
-                              const_Real_p y,
-                              const_Real_p z
-                             )
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CalcElemFBHourglassForce(
-                              Real_p xd, Real_p yd, Real_p zd,
-                              Real_p hourgam0, Real_p hourgam1,
-                              Real_p hourgam2, Real_p hourgam3,
-                              Real_p hourgam4, Real_p hourgam5,
-                              Real_p hourgam6, Real_p hourgam7,
-                              Real_t coefficient,
-                              Real_p hgfx, Real_p hgfy, Real_p hgfz
-                             )
-{
-   const Index_t i00=0;
-   const Index_t i01=1;
-   const Index_t i02=2;
-   const Index_t i03=3;
-
-   Real_t h00 =
-      hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] +
-      hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] +
-      hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] +
-      hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7];
-
-   Real_t h01 =
-      hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] +
-      hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] +
-      hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] +
-      hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7];
-
-   Real_t h02 =
-      hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+
-      hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+
-      hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+
-      hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7];
-
-   Real_t h03 =
-      hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] +
-      hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] +
-      hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] +
-      hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7];
-
-   hgfx[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfx[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfx[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfx[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfx[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfx[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfx[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfx[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] +
-      hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] +
-      hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] +
-      hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7];
-
-   h01 =
-      hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] +
-      hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] +
-      hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] +
-      hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7];
-
-   h02 =
-      hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+
-      hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+
-      hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+
-      hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7];
-
-   h03 =
-      hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] +
-      hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] +
-      hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] +
-      hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7];
-
-
-   hgfy[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfy[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfy[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfy[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfy[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfy[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfy[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfy[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] +
-      hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] +
-      hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] +
-      hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7];
-
-   h01 =
-      hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] +
-      hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] +
-      hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] +
-      hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7];
-
-   h02 =
-      hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+
-      hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+
-      hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+
-      hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7];
-
-   h03 =
-      hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] +
-      hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] +
-      hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] +
-      hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7];
-
-
-   hgfz[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfz[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfz[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfz[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfz[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfz[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfz[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfz[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-}
-
-RAJA_STORAGE
-void CalcFBHourglassForceForElems( Index_p nodelist,
-                                   Real_p  ss, Real_p  elemMass,
-                                   Real_p  xd, Real_p  yd, Real_p  zd,
-                                   Real_p  fx, Real_p  fy, Real_p  fz,
-                                   Real_p  determ,
-                                   Real_p  x8n, Real_p  y8n, Real_p  z8n,
-                                   Real_p  dvdx, Real_p  dvdy, Real_p  dvdz,
-                                   Real_t hourg, 
-                                   RAJA::IndexSet *domElemList,
-                                   RAJA::IndexSet *domNodeList
-#if defined(OMP_FINE_SYNC)
-                                  ,Index_t numElem, Index_p nodeElemStart,
-                                   Index_p nodeElemCornerList
-#endif
-                                 )
-{
-   /*************************************************
-    *
-    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-    *               force.
-    *
-    *************************************************/
-
-#if defined(OMP_FINE_SYNC)
-   Real_p fx_elem = elemMemPool.allocate(numElem*8) ;
-   Real_p fy_elem = elemMemPool.allocate(numElem*8) ;
-   Real_p fz_elem = elemMemPool.allocate(numElem*8) ;
-#endif
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-   RAJA::forall<elem_exec_policy>(*domElemList, [=] RAJA_DEVICE (int i2) {
-#if !defined(OMP_FINE_SYNC)
-      Real_t hgfx[8], hgfy[8], hgfz[8] ;
-#endif
-      Real_t coefficient;
-
-      Real_t hourgam0[4], hourgam1[4], hourgam2[4], hourgam3[4] ;
-      Real_t hourgam4[4], hourgam5[4], hourgam6[4], hourgam7[4];
-      Real_t xd1[8], yd1[8], zd1[8] ;
-
-      // Define this here so code works on both host and device
-      const Real_t ggamma[4][8] =
-      {
-        { Real_t( 1.), Real_t( 1.), Real_t(-1.), Real_t(-1.),
-          Real_t(-1.), Real_t(-1.), Real_t( 1.), Real_t( 1.) },
-
-        { Real_t( 1.), Real_t(-1.), Real_t(-1.), Real_t( 1.),
-          Real_t(-1.), Real_t( 1.), Real_t( 1.), Real_t(-1.) },
-
-        { Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.),
-          Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.) },
-
-        { Real_t(-1.), Real_t( 1.), Real_t(-1.), Real_t( 1.),
-          Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.) }
-      } ;
-
-      Index_p elemToNode = &nodelist[8*i2];
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * ggamma[i1][0] + x8n[i3+1] * ggamma[i1][1] +
-            x8n[i3+2] * ggamma[i1][2] + x8n[i3+3] * ggamma[i1][3] +
-            x8n[i3+4] * ggamma[i1][4] + x8n[i3+5] * ggamma[i1][5] +
-            x8n[i3+6] * ggamma[i1][6] + x8n[i3+7] * ggamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * ggamma[i1][0] + y8n[i3+1] * ggamma[i1][1] +
-            y8n[i3+2] * ggamma[i1][2] + y8n[i3+3] * ggamma[i1][3] +
-            y8n[i3+4] * ggamma[i1][4] + y8n[i3+5] * ggamma[i1][5] +
-            y8n[i3+6] * ggamma[i1][6] + y8n[i3+7] * ggamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * ggamma[i1][0] + z8n[i3+1] * ggamma[i1][1] +
-            z8n[i3+2] * ggamma[i1][2] + z8n[i3+3] * ggamma[i1][3] +
-            z8n[i3+4] * ggamma[i1][4] + z8n[i3+5] * ggamma[i1][5] +
-            z8n[i3+6] * ggamma[i1][6] + z8n[i3+7] * ggamma[i1][7];
-
-         hourgam0[i1] = ggamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam1[i1] = ggamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam2[i1] = ggamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam3[i1] = ggamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam4[i1] = ggamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam5[i1] = ggamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam6[i1] = ggamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam7[i1] = ggamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=ss[i2];
-      mass1=elemMass[i2];
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = xd[n0si2];
-      xd1[1] = xd[n1si2];
-      xd1[2] = xd[n2si2];
-      xd1[3] = xd[n3si2];
-      xd1[4] = xd[n4si2];
-      xd1[5] = xd[n5si2];
-      xd1[6] = xd[n6si2];
-      xd1[7] = xd[n7si2];
-
-      yd1[0] = yd[n0si2];
-      yd1[1] = yd[n1si2];
-      yd1[2] = yd[n2si2];
-      yd1[3] = yd[n3si2];
-      yd1[4] = yd[n4si2];
-      yd1[5] = yd[n5si2];
-      yd1[6] = yd[n6si2];
-      yd1[7] = yd[n7si2];
-
-      zd1[0] = zd[n0si2];
-      zd1[1] = zd[n1si2];
-      zd1[2] = zd[n2si2];
-      zd1[3] = zd[n3si2];
-      zd1[4] = zd[n4si2];
-      zd1[5] = zd[n5si2];
-      zd1[6] = zd[n6si2];
-      zd1[7] = zd[n7si2];
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1,
-                      hourgam0,hourgam1,hourgam2,hourgam3,
-                      hourgam4,hourgam5,hourgam6,hourgam7, coefficient,
-#if !defined(OMP_FINE_SYNC)
-                      hgfx, hgfy, hgfz
-#else
-                      &fx_elem[i3], &fy_elem[i3], &fz_elem[i3]
-#endif
-                    );
-#if !defined(OMP_FINE_SYNC)
-      fx[n0si2] += hgfx[0] ;
-      fy[n0si2] += hgfy[0] ;
-      fz[n0si2] += hgfz[0] ;
-
-      fx[n1si2] += hgfx[1] ;
-      fy[n1si2] += hgfy[1] ;
-      fz[n1si2] += hgfz[1] ;
-
-      fx[n2si2] += hgfx[2] ;
-      fy[n2si2] += hgfy[2] ;
-      fz[n2si2] += hgfz[2] ;
-
-      fx[n3si2] += hgfx[3] ;
-      fy[n3si2] += hgfy[3] ;
-      fz[n3si2] += hgfz[3] ;
-
-      fx[n4si2] += hgfx[4] ;
-      fy[n4si2] += hgfy[4] ;
-      fz[n4si2] += hgfz[4] ;
-
-      fx[n5si2] += hgfx[5] ;
-      fy[n5si2] += hgfy[5] ;
-      fz[n5si2] += hgfz[5] ;
-
-      fx[n6si2] += hgfx[6] ;
-      fy[n6si2] += hgfy[6] ;
-      fz[n6si2] += hgfz[6] ;
-
-      fx[n7si2] += hgfx[7] ;
-      fy[n7si2] += hgfy[7] ;
-      fz[n7si2] += hgfz[7] ;
-#endif
-
-    }
-   ) ;
-
-#if defined(OMP_FINE_SYNC)
-   RAJA::forall<node_exec_policy>(*domNodeList, [=] RAJA_DEVICE (int gnode) {
-      Index_t count = nodeElemStart[gnode+1] - nodeElemStart[gnode] ;
-      Index_t *cornerList = &nodeElemCornerList[nodeElemStart[gnode]] ;
-      Real_t fx_sum = Real_t(0.0) ;
-      Real_t fy_sum = Real_t(0.0) ;
-      Real_t fz_sum = Real_t(0.0) ;
-      for (Index_t i=0 ; i < count ; ++i) {
-         Index_t elem = cornerList[i] ;
-         fx_sum += fx_elem[elem] ;
-         fy_sum += fy_elem[elem] ;
-         fz_sum += fz_elem[elem] ;
-      }
-      fx[gnode] += fx_sum ;
-      fy[gnode] += fy_sum ;
-      fz[gnode] += fz_sum ;
-    }
-   ) ;
-
-   elemMemPool.release(&fz_elem) ;
-   elemMemPool.release(&fy_elem) ;
-   elemMemPool.release(&fx_elem) ;
-#endif
-}
-
-RAJA_STORAGE
-void CalcHourglassControlForElems(Domain *domain,
-                                  Real_p determ,
-                                  Real_t hgcoef)
-{
-   Index_t numElem = domain->numElem ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_p dvdx = elemMemPool.allocate(numElem8) ;
-   Real_p dvdy = elemMemPool.allocate(numElem8) ;
-   Real_p dvdz = elemMemPool.allocate(numElem8) ;
-   Real_p x8n  = elemMemPool.allocate(numElem8) ;
-   Real_p y8n  = elemMemPool.allocate(numElem8) ;
-   Real_p z8n  = elemMemPool.allocate(numElem8) ;
-
-   // For negative element volume check
-   RAJA::ReduceMin<reduce_policy, Real_t> minvol(Real_t(1.0e+20));
-
-   /* start loop over elements */
-   RAJA::forall<elem_exec_policy>(*domain->domElemList, [=] RAJA_DEVICE (int idx) {
-
-      Index_p elemToNode = &domain->nodelist[8*idx];
-      CollectDomainNodesToElemNodes(domain->x, domain->y, domain->z, elemToNode,
-                                    &x8n[8*idx], &y8n[8*idx], &z8n[8*idx] );
-
-      CalcElemVolumeDerivative(&dvdx[8*idx], &dvdy[8*idx], &dvdz[8*idx],
-                               &x8n[8*idx], &y8n[8*idx], &z8n[8*idx]);
-
-      determ[idx] = domain->volo[idx] * domain->v[idx];
-
-      minvol.min(domain->v[idx]);
-
-    }
-   ) ;
-
-   if ( Real_t(minvol) <= Real_t(0.0) ) {
-      exit(VolumeError) ;
-   } 
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems( domain->nodelist,
-                                    domain->ss, domain->elemMass,
-                                    domain->xd, domain->yd, domain->zd,
-                                    domain->fx, domain->fy, domain->fz,
-                                    determ, 
-                                    x8n, y8n, z8n, 
-                                    dvdx, dvdy, dvdz,
-                                    hgcoef,
-                                    domain->domElemList, domain->domNodeList
-#if defined(OMP_FINE_SYNC)
-                                   ,numElem, domain->nodeElemStart,
-                                    domain->nodeElemCornerList
-#endif
-                                  ) ;
-   }
-
-   elemMemPool.release(&z8n) ;
-   elemMemPool.release(&y8n) ;
-   elemMemPool.release(&x8n) ;
-   elemMemPool.release(&dvdz) ;
-   elemMemPool.release(&dvdy) ;
-   elemMemPool.release(&dvdx) ;
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcVolumeForceForElems(Domain *domain)
-{
-   Index_t numElem = domain->numElem ;
-   if (numElem != 0) {
-      Real_t  hgcoef = domain->hgcoef ;
-      Real_p sigxx = elemMemPool.allocate(numElem) ;
-      Real_p sigyy = elemMemPool.allocate(numElem) ;
-      Real_p sigzz = elemMemPool.allocate(numElem) ;
-      Real_p determ = elemMemPool.allocate(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(domain->p, domain->q,
-                              sigxx, sigyy, sigzz, 
-                              domain->domElemList);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( domain->nodelist,
-                               domain->x, domain->y, domain->z,
-                               domain->fx, domain->fy, domain->fz,
-                               sigxx,
-                               sigyy,
-                               sigzz,
-                               determ,
-                               domain->domElemList, 
-                               domain->domNodeList
-#if defined(OMP_FINE_SYNC)
-                              ,numElem, domain->nodeElemStart,
-                               domain->nodeElemCornerList
-#endif
-                             ) ;
-
-      // check for negative element volume
-      RAJA::ReduceMin<reduce_policy, Real_t> minvol(Real_t(1.0e+20));
-      RAJA::forall<elem_exec_policy>(*domain->domElemList, [=] RAJA_DEVICE (int k) {
-         minvol.min(determ[k]);
-       }
-      ) ;
-
-      if ( Real_t(minvol) <= Real_t(0.0)) {
-         exit(VolumeError) ;
-      }
-
-      CalcHourglassControlForElems(domain, determ, hgcoef) ;
-
-      elemMemPool.release(&determ) ;
-      elemMemPool.release(&sigzz) ;
-      elemMemPool.release(&sigyy) ;
-      elemMemPool.release(&sigxx) ;
-   }
-}
-
-RAJA_STORAGE
-void CalcForceForNodes(Domain *domain)
-{
-  RAJA::forall<node_exec_policy>(*domain->domNodeList, [=] RAJA_DEVICE (int i) {
-     domain->fx[i] = Real_t(0.0) ;
-     domain->fy[i] = Real_t(0.0) ;
-     domain->fz[i] = Real_t(0.0) ;
-   }
-  ) ;
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems(domain) ;
-
-  /* Calculate Nodal Forces at domain boundaries */
-  /* problem->commSBN->Transfer(CommSBN::forces); */
-
-}
-
-RAJA_STORAGE
-void CalcAccelerationForNodes(Real_p xdd, Real_p ydd, Real_p zdd,
-                              Real_p fx, Real_p fy, Real_p fz,
-                              Real_p nodalMass, RAJA::IndexSet *domNodeList)
-{
-   RAJA::forall<node_exec_policy>(*domNodeList, [=] RAJA_DEVICE (int i) {
-      xdd[i] = fx[i] / nodalMass[i];
-      ydd[i] = fy[i] / nodalMass[i];
-      zdd[i] = fz[i] / nodalMass[i];
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void ApplyAccelerationBoundaryConditionsForNodes(Real_p xdd, Real_p ydd,
-                                                 Real_p zdd,
-                                                 RAJA::IndexSet *symmX,
-                                                 RAJA::IndexSet *symmY,
-                                                 RAJA::IndexSet *symmZ)
-{
-   RAJA::forall<symnode_exec_policy>(*symmX, [=] RAJA_DEVICE (int i) {
-      xdd[i] = Real_t(0.0) ;
-   } );
-
-   RAJA::forall<symnode_exec_policy>(*symmY, [=] RAJA_DEVICE (int i) {
-      ydd[i] = Real_t(0.0) ;
-   } );
-
-   RAJA::forall<symnode_exec_policy>(*symmZ, [=] RAJA_DEVICE (int i) {
-      zdd[i] = Real_t(0.0) ;
-   } );
-}
-
-RAJA_STORAGE
-void CalcVelocityForNodes(Real_p xd,  Real_p yd,  Real_p zd,
-                          Real_p xdd, Real_p ydd, Real_p zdd,
-                          const Real_t dt, const Real_t u_cut,
-                          RAJA::IndexSet *domNodeList)
-{
-   RAJA::forall<node_exec_policy>( *domNodeList, [=] RAJA_DEVICE (int i) {
-
-     Real_t xdtmp = xd[i] + xdd[i] * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     xd[i] = xdtmp ;
-
-     Real_t ydtmp = yd[i] + ydd[i] * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     yd[i] = ydtmp ;
-
-     Real_t zdtmp = zd[i] + zdd[i] * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     zd[i] = zdtmp ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void CalcPositionForNodes(Real_p x,  Real_p y,  Real_p z,
-                          Real_p xd, Real_p yd, Real_p zd,
-                          const Real_t dt, RAJA::IndexSet *domNodeList)
-{
-   RAJA::forall<node_exec_policy>( *domNodeList, [=] RAJA_DEVICE (int i) {
-     x[i] += xd[i] * dt ;
-     y[i] += yd[i] * dt ;
-     z[i] += zd[i] * dt ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void LagrangeNodal(Domain *domain)
-{
-  const Real_t delt = domain->deltatime ;
-  Real_t u_cut = domain->u_cut ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes(domain);
-
-  CalcAccelerationForNodes(domain->xdd, domain->ydd, domain->zdd,
-                           domain->fx, domain->fy, domain->fz,
-                           domain->nodalMass, domain->domNodeList);
-
-  ApplyAccelerationBoundaryConditionsForNodes(domain->xdd, domain->ydd,
-                                              domain->zdd, domain->symmX,
-                                              domain->symmY, domain->symmZ);
-
-  CalcVelocityForNodes( domain->xd,  domain->yd,  domain->zd,
-                        domain->xdd, domain->ydd, domain->zdd,
-                        delt, u_cut, domain->domNodeList) ;
-
-  CalcPositionForNodes( domain->x,  domain->y,  domain->z,
-                        domain->xd, domain->yd, domain->zd,
-                        delt, domain->domNodeList );
-
-  return;
-}
-
-RAJA_STORAGE
-RAJA_HOST_DEVICE
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-RAJA_STORAGE
-RAJA_HOST_DEVICE
-Real_t CalcElemVolume(
-                       const_Real_p x, const_Real_p y, const_Real_p z
-                     )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = MAX(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CalcElemVelocityGrandient( const Real_t* const xvel,
-                                const Real_t* const yvel,
-                                const Real_t* const zvel,
-                                const Real_t b[][8],
-                                const Real_t detJ,
-                                Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-RAJA_STORAGE
-void CalcKinematicsForElems( Index_p nodelist,
-                             Real_p x,   Real_p y,   Real_p z,
-                             Real_p xd,  Real_p yd,  Real_p zd,
-                             Real_p dxx, Real_p dyy, Real_p dzz,
-                             Real_p v, Real_p volo,
-                             Real_p vnew, Real_p delv, Real_p arealg,
-                             Real_t deltaTime, RAJA::IndexSet *domElemList )
-{
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(*domElemList, [=] RAJA_DEVICE (int k) {
-    Real_t B[3][8] ; /** shape function derivatives */
-    Real_t D[6] ;
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-    Real_t xd_local[8] ;
-    Real_t yd_local[8] ;
-    Real_t zd_local[8] ;
-    Real_t detJ = Real_t(0.0) ;
-
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_p elemToNode = &nodelist[8*k] ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      x_local[lnode] = x[gnode];
-      y_local[lnode] = y[gnode];
-      z_local[lnode] = z[gnode];
-    }
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / volo[k] ;
-    vnew[k] = relativeVolume ;
-    delv[k] = relativeVolume - v[k] ;
-
-    // set characteristic length
-    arealg[k] = CalcElemCharacteristicLength(x_local, y_local, z_local,
-                                             volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = xd[gnode];
-      yd_local[lnode] = yd[gnode];
-      zd_local[lnode] = zd[gnode];
-    }
-
-    Real_t dt2 = Real_t(0.5) * deltaTime;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local, y_local, z_local,
-                                      B, &detJ );
-
-    CalcElemVelocityGrandient( xd_local, yd_local, zd_local,
-                               B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    dxx[k] = D[0];
-    dyy[k] = D[1];
-    dzz[k] = D[2];
-   }
-  ) ;
-}
-
-RAJA_STORAGE
-void CalcLagrangeElements(Domain *domain)
-{
-   Index_t numElem = domain->numElem ;
-   if (numElem > 0) {
-      const Real_t deltatime = domain->deltatime ;
-
-      /* pricipal strains */
-      domain->dxx  = elemMemPool.allocate(numElem) ;
-      domain->dyy  = elemMemPool.allocate(numElem) ;
-      domain->dzz  = elemMemPool.allocate(numElem) ;
-
-      CalcKinematicsForElems(domain->nodelist,
-                             domain->x, domain->y, domain->z,
-                             domain->xd, domain->yd, domain->zd,
-                             domain->dxx, domain->dyy, domain->dzz,
-                             domain->v, domain->volo,
-                             domain->vnew, domain->delv, domain->arealg,
-                             deltatime, domain->domElemList) ;
-
-      // check for negative element volume
-      RAJA::ReduceMin<reduce_policy, Real_t> minvol(Real_t(1.0e+20));
-
-      // element loop to do some stuff not included in the elemlib function.
-      RAJA::forall<elem_exec_policy>( *domain->domElemList, [=] RAJA_DEVICE (int k) {
-        // calc strain rate and apply as constraint (only done in FB element)
-        Real_t vdov = domain->dxx[k] + domain->dyy[k] + domain->dzz[k] ;
-        Real_t vdovthird = vdov/Real_t(3.0) ;
-        
-        // make the rate of deformation tensor deviatoric
-        domain->vdov[k] = vdov ;
-        domain->dxx[k] -= vdovthird ;
-        domain->dyy[k] -= vdovthird ;
-        domain->dzz[k] -= vdovthird ;
-
-        minvol.min(domain->vnew[k]);
-       }
-      ) ;
-
-      if ( Real_t(minvol) <= Real_t(0.0)) {
-         exit(VolumeError) ;
-      }
-
-      elemMemPool.release(&domain->dzz) ;
-      elemMemPool.release(&domain->dyy) ;
-      elemMemPool.release(&domain->dxx) ;
-   }
-}
-
-RAJA_STORAGE
-void CalcMonotonicQGradientsForElems(Real_p x,  Real_p y,  Real_p z,
-                                     Real_p xd, Real_p yd, Real_p zd,
-                                     Real_p volo, Real_p vnew,
-                                     Real_p delv_xi,
-                                     Real_p delv_eta,
-                                     Real_p delv_zeta,
-                                     Real_p delx_xi,
-                                     Real_p delx_eta,
-                                     Real_p delx_zeta,
-                                     Index_p nodelist,
-                                     RAJA::IndexSet *domElemList)
-{
-#define SUM4(a,b,c,d) (a + b + c + d)
-
-   RAJA::forall<elem_exec_policy>(*domElemList, [=] RAJA_DEVICE (int i) {
-      const Real_t ptiny = Real_t(1.e-36) ;
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      Index_p elemToNode = &nodelist[8*i];
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = x[n0] ;
-      Real_t x1 = x[n1] ;
-      Real_t x2 = x[n2] ;
-      Real_t x3 = x[n3] ;
-      Real_t x4 = x[n4] ;
-      Real_t x5 = x[n5] ;
-      Real_t x6 = x[n6] ;
-      Real_t x7 = x[n7] ;
-
-      Real_t y0 = y[n0] ;
-      Real_t y1 = y[n1] ;
-      Real_t y2 = y[n2] ;
-      Real_t y3 = y[n3] ;
-      Real_t y4 = y[n4] ;
-      Real_t y5 = y[n5] ;
-      Real_t y6 = y[n6] ;
-      Real_t y7 = y[n7] ;
-
-      Real_t z0 = z[n0] ;
-      Real_t z1 = z[n1] ;
-      Real_t z2 = z[n2] ;
-      Real_t z3 = z[n3] ;
-      Real_t z4 = z[n4] ;
-      Real_t z5 = z[n5] ;
-      Real_t z6 = z[n6] ;
-      Real_t z7 = z[n7] ;
-
-      Real_t xv0 = xd[n0] ;
-      Real_t xv1 = xd[n1] ;
-      Real_t xv2 = xd[n2] ;
-      Real_t xv3 = xd[n3] ;
-      Real_t xv4 = xd[n4] ;
-      Real_t xv5 = xd[n5] ;
-      Real_t xv6 = xd[n6] ;
-      Real_t xv7 = xd[n7] ;
-
-      Real_t yv0 = yd[n0] ;
-      Real_t yv1 = yd[n1] ;
-      Real_t yv2 = yd[n2] ;
-      Real_t yv3 = yd[n3] ;
-      Real_t yv4 = yd[n4] ;
-      Real_t yv5 = yd[n5] ;
-      Real_t yv6 = yd[n6] ;
-      Real_t yv7 = yd[n7] ;
-
-      Real_t zv0 = zd[n0] ;
-      Real_t zv1 = zd[n1] ;
-      Real_t zv2 = zd[n2] ;
-      Real_t zv3 = zd[n3] ;
-      Real_t zv4 = zd[n4] ;
-      Real_t zv5 = zd[n5] ;
-      Real_t zv6 = zd[n6] ;
-      Real_t zv7 = zd[n7] ;
-
-      Real_t vol = volo[i]*vnew[i] ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;
-      Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;
-      Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;
-      Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;
-      Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;
-      Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;
-      Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      delx_zeta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;
-      dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;
-      dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;
-
-      delv_zeta[i] = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      delx_xi[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;
-      dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;
-      dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;
-
-      delv_xi[i] = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      delx_eta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;
-      dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;
-      dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;
-
-      delv_eta[i] = ax*dxv + ay*dyv + az*dzv ;
-    }
-   ) ;
-
-#undef SUM4
-}
-
-RAJA_STORAGE
-void CalcMonotonicQRegionForElems(
-                           RAJA::IndexSet *matElemList, Index_p elemBC,
-                           Index_p lxim,   Index_p lxip,
-                           Index_p letam,  Index_p letap,
-                           Index_p lzetam, Index_p lzetap,
-                           Real_p delv_xi,Real_p delv_eta,Real_p delv_zeta,
-                           Real_p delx_xi,Real_p delx_eta,Real_p delx_zeta,
-                           Real_p vdov, Real_p volo, Real_p vnew,
-                           Real_p elemMass, Real_p qq, Real_p ql,
-                           Real_t qlc_monoq, Real_t qqc_monoq,
-                           Real_t monoq_limiter_mult,
-                           Real_t monoq_max_slope,
-                           Real_t ptiny )
-{
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Int_t bcMask = elemBC[i] ;
-      Real_t delvm, delvp ;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / ( delv_xi[i] + ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case 0:         delvm = delv_xi[lxim[i]] ; break ;
-         case XI_M_SYMM: delvm = delv_xi[i] ;       break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
-         default:        /* ERROR */ ;              break ;
-      }
-      switch (bcMask & XI_P) {
-         case 0:         delvp = delv_xi[lxip[i]] ; break ;
-         case XI_P_SYMM: delvp = delv_xi[i] ;       break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
-         default:        /* ERROR */ ;              break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( delv_eta[i] + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case 0:          delvm = delv_eta[letam[i]] ; break ;
-         case ETA_M_SYMM: delvm = delv_eta[i] ;        break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
-         default:         /* ERROR */ ;                break ;
-      }
-      switch (bcMask & ETA_P) {
-         case 0:          delvp = delv_eta[letap[i]] ; break ;
-         case ETA_P_SYMM: delvp = delv_eta[i] ;        break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
-         default:         /* ERROR */ ;                break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( delv_zeta[i] + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case 0:           delvm = delv_zeta[lzetam[i]] ; break ;
-         case ZETA_M_SYMM: delvm = delv_zeta[i] ;         break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
-         default:          /* ERROR */ ;                  break ;
-      }
-      switch (bcMask & ZETA_P) {
-         case 0:           delvp = delv_zeta[lzetap[i]] ; break ;
-         case ZETA_P_SYMM: delvp = delv_zeta[i] ;         break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
-         default:          /* ERROR */ ;                  break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( vdov[i] > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = delv_xi[i]   * delx_xi[i]   ;
-         Real_t delvxeta  = delv_eta[i]  * delx_eta[i]  ;
-         Real_t delvxzeta = delv_zeta[i] * delx_zeta[i] ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = elemMass[i] / (volo[i] * vnew[i]) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      qq[i] = qquad ;
-      ql[i] = qlin  ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void CalcMonotonicQForElems(Domain *domain)
-{  
-   //
-   // calculate the monotonic q for pure regions
-   //
-   Index_t numElem = domain->numElem ;
-   if (numElem > 0) {
-      //
-      // initialize parameters
-      // 
-      const Real_t ptiny = Real_t(1.e-36) ;
-
-      CalcMonotonicQRegionForElems(
-                           domain->matElemList, domain->elemBC,
-                           domain->lxim,   domain->lxip,
-                           domain->letam,  domain->letap,
-                           domain->lzetam, domain->lzetap,
-                           domain->delv_xi,domain->delv_eta,domain->delv_zeta,
-                           domain->delx_xi,domain->delx_eta,domain->delx_zeta,
-                           domain->vdov, domain->volo, domain->vnew,
-                           domain->elemMass, domain->qq, domain->ql,
-                           domain->qlc_monoq, domain->qqc_monoq,
-                           domain->monoq_limiter_mult,
-                           domain->monoq_max_slope,
-                           ptiny );
-   }
-}
-
-RAJA_STORAGE
-void CalcQForElems(Domain *domain)
-{
-   //
-   // MONOTONIC Q option
-   //
-
-   Index_t numElem = domain->numElem ;
-
-   if (numElem != 0) {
-      /* allocate domain length arrays */
-
-      /* velocity gradient */
-      domain->delv_xi   = elemMemPool.allocate(numElem) ;
-      domain->delv_eta  = elemMemPool.allocate(numElem) ;
-      domain->delv_zeta = elemMemPool.allocate(numElem) ;
-
-      /* position gradient */
-      domain->delx_xi   = elemMemPool.allocate(numElem) ;
-      domain->delx_eta  = elemMemPool.allocate(numElem) ;
-      domain->delx_zeta = elemMemPool.allocate(numElem) ;
-
-      /* Calculate velocity gradients, applied at the domain level */
-      CalcMonotonicQGradientsForElems(domain->x,  domain->y,  domain->z,
-                                      domain->xd, domain->yd, domain->zd,
-                                      domain->volo, domain->vnew,
-                                      domain->delv_xi,
-                                      domain->delv_eta,
-                                      domain->delv_zeta,
-                                      domain->delx_xi,
-                                      domain->delx_eta,
-                                      domain->delx_zeta,
-                                      domain->nodelist,
-                                      domain->domElemList) ;
-
-      /* Transfer veloctiy gradients in the first order elements */
-      /* problem->commElements->Transfer(CommElements::monoQ) ; */
-
-      /* This will be applied at the region level */
-      CalcMonotonicQForElems(domain) ;
-
-      /* release domain length arrays */
-
-      elemMemPool.release(&domain->delx_zeta) ;
-      elemMemPool.release(&domain->delx_eta) ;
-      elemMemPool.release(&domain->delx_xi) ;
-
-      elemMemPool.release(&domain->delv_zeta) ;
-      elemMemPool.release(&domain->delv_eta) ;
-      elemMemPool.release(&domain->delv_xi) ;
-
-      /* Don't allow excessive artificial viscosity */
-      Real_t qstop = domain->qstop ;
-      domain->idx = -1; 
-
-      /* Workaround reference capture by using structure field instead */ 
-      RAJA::forall<elem_exec_policy>( *domain->domElemList, [=] RAJA_DEVICE (int i) {
-         if ( domain->q[i] > qstop ) {
-            domain->idx = i ;
-            // break ;
-         }
-       }
-      ) ;
-
-      if (domain->idx >= 0) {
-         exit(QStopError) ;
-      }
-   }
-}
-
-RAJA_STORAGE
-void CalcPressureForElems(Real_p p_new, Real_p bvc,
-                          Real_p pbvc, Real_p e_old,
-                          Real_p compression, Real_p vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          RAJA::IndexSet *matElemList)
-{
-   const Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      bvc[i] = c1s * (compression[i] + Real_t(1.));
-      pbvc[i] = c1s;
-    }
-   ) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      p_new[i] = bvc[i] * e_old[i] ;
-
-      if    (FABS(p_new[i]) <  p_cut   )
-         p_new[i] = Real_t(0.0) ;
-
-      if    ( vnewc[i] >= eosvmax ) /* impossible condition here? */
-         p_new[i] = Real_t(0.0) ;
-
-      if    (p_new[i]       <  pmin)
-         p_new[i]   = pmin ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void CalcEnergyForElems(Real_p p_new, Real_p e_new, Real_p q_new,
-                        Real_p bvc, Real_p pbvc,
-                        Real_p p_old, Real_p e_old, Real_p q_old,
-                        Real_p compression, Real_p compHalfStep,
-                        Real_p vnewc, Real_p work, Real_p delvc, Real_t pmin,
-                        Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
-                        Real_p qq_old, Real_p ql_old,
-                        Real_t rho0,
-                        Real_t eosvmax,
-                        RAJA::IndexSet *matElemList,
-                        Index_t length)
-{
-   const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-   Real_p pHalfStep = elemMemPool.allocate(length) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
-         + Real_t(0.5) * work[i];
-
-      if (e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-    }
-   ) ;
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                   pmin, p_cut, eosvmax, matElemList);
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
-
-      if ( delvc[i] > Real_t(0.) ) {
-         q_new[i] /* = qq_old[i] = ql_old[i] */ = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new[i] + Real_t(0.5) * (delvc[i]
-           * (  Real_t(3.0)*(p_old[i]     + q_old[i])
-              - Real_t(4.0)*(pHalfStep[i] + q_new[i])) + work[i] ) ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-    }
-   ) ;
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, matElemList);
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-      Real_t q_tilde ;
-
-      if (delvc[i] > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
-                   - Real_t(8.0)*(pHalfStep[i] + q_new[i])
-                   + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-    }
-   ) ;
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, matElemList);
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int i) {
-
-      if ( delvc[i] <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-
-         if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
-      }
-    }
-   ) ;
-
-   elemMemPool.release(&pHalfStep) ;
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcSoundSpeedForElems(RAJA::IndexSet *matElemList, Real_p ss,
-                            Real_p vnewc, Real_t rho0, Real_p enewc,
-                            Real_p pnewc, Real_p pbvc,
-                            Real_p bvc, Real_t ss4o3)
-{
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int iz) {
-      Real_t ssTmp = (pbvc[iz] * enewc[iz] + vnewc[iz] * vnewc[iz] *
-                 bvc[iz] * pnewc[iz]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp);
-      }
-      ss[iz] = ssTmp ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void EvalEOSForElems(Domain *domain, Real_p vnewc, Index_t numElem)
-{
-   Real_t  e_cut = domain->e_cut ;
-   Real_t  p_cut = domain->p_cut ;
-   Real_t  ss4o3 = domain->ss4o3 ;
-   Real_t  q_cut = domain->q_cut ;
-
-   Real_t eosvmax = domain->eosvmax ;
-   Real_t eosvmin = domain->eosvmin ;
-   Real_t pmin    = domain->pmin ;
-   Real_t emin    = domain->emin ;
-   Real_t rho0    = domain->refdens ;
-
-   /* allocate *domain length* arrays.  */
-   /* wastes memory, but allows us to get */
-   /* around a "temporary workset" issue */
-   /* we have not yet addressed. */
-   Real_p delvc = domain->delv ;
-   Real_p p_old        = elemMemPool.allocate(numElem) ;
-   Real_p compression  = elemMemPool.allocate(numElem) ;
-   Real_p compHalfStep = elemMemPool.allocate(numElem) ;
-   Real_p work         = elemMemPool.allocate(numElem) ;
-   Real_p p_new        = elemMemPool.allocate(numElem) ;
-   Real_p e_new        = elemMemPool.allocate(numElem) ;
-   Real_p q_new        = elemMemPool.allocate(numElem) ;
-   Real_p bvc          = elemMemPool.allocate(numElem) ;
-   Real_p pbvc         = elemMemPool.allocate(numElem) ;
-
-   /* compress data, minimal set */
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zidx) {
-      p_old[zidx] = domain->p[zidx] ;
-    }
-   ) ;
-
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zidx) {
-      Real_t vchalf ;
-      compression[zidx] = Real_t(1.) / vnewc[zidx] - Real_t(1.);
-      vchalf = vnewc[zidx] - delvc[zidx] * Real_t(.5);
-      compHalfStep[zidx] = Real_t(1.) / vchalf - Real_t(1.);
-
-      /* Check for v > eosvmax or v < eosvmin */
-      if ( eosvmin != Real_t(0.) ) {
-         if (vnewc[zidx] <= eosvmin) { /* impossible due to calling func? */
-            compHalfStep[zidx] = compression[zidx] ;
-         }
-      }
-
-      if ( eosvmax != Real_t(0.) ) {
-         if (vnewc[zidx] >= eosvmax) { /* impossible due to calling func? */
-            p_old[zidx]        = Real_t(0.) ;
-            compression[zidx]  = Real_t(0.) ;
-            compHalfStep[zidx] = Real_t(0.) ;
-         }
-      }
-    } 
-   ) ;
-   
-
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zidx) {
-      work[zidx] = Real_t(0.) ; 
-    }
-   ) ;
-
-   CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
-                 p_old, domain->e,  domain->q, compression, compHalfStep,
-                 vnewc, work,  delvc, pmin,
-                 p_cut, e_cut, q_cut, emin,
-                 domain->qq, domain->ql, rho0, eosvmax,
-                 domain->matElemList, numElem);
-
-
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zidx) {
-      domain->p[zidx] = p_new[zidx] ;
-      domain->e[zidx] = e_new[zidx] ;
-      domain->q[zidx] = q_new[zidx] ;
-    }
-   ) ;
-
-   CalcSoundSpeedForElems(domain->matElemList, domain->ss,
-             vnewc, rho0, e_new, p_new,
-             pbvc, bvc, ss4o3) ;
-
-   elemMemPool.release(&pbvc) ;
-   elemMemPool.release(&bvc) ;
-   elemMemPool.release(&q_new) ;
-   elemMemPool.release(&e_new) ;
-   elemMemPool.release(&p_new) ;
-   elemMemPool.release(&work) ;
-   elemMemPool.release(&compHalfStep) ;
-   elemMemPool.release(&compression) ;
-   elemMemPool.release(&p_old) ;
-}
-
-RAJA_STORAGE
-void ApplyMaterialPropertiesForElems(Domain *domain)
-{
-  Index_t numElem = domain->numElem ;
-
-  if (numElem != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = domain->eosvmin ;
-    Real_t eosvmax = domain->eosvmax ;
-
-    /* create a domain length (not material length) temporary */
-    /* we are assuming here that the number of dense ranges is */
-    /* much greater than the number of sigletons.  We are also */
-    /* assuming it is ok to allocate a domain length temporary */
-    /* rather than a material length temporary. */
-
-    Real_p vnewc = elemMemPool.allocate(numElem) ;
-
-    RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zn) {
-       vnewc[zn] = domain->vnew[zn] ;
-
-       if (eosvmin != Real_t(0.)) {
-          if (vnewc[zn] < eosvmin) {
-             vnewc[zn] = eosvmin ;
-          }
-       }
-
-       if (eosvmax != Real_t(0.)) {
-          if (vnewc[zn] > eosvmax) {
-             vnewc[zn] = eosvmax ;
-          }
-       }
-
-     }
-    ) ;
-
-    // check for negative element volume
-    RAJA::ReduceMin<reduce_policy, Real_t> minvol(Real_t(1.0e+20));
-
-    RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] RAJA_DEVICE (int zn) {
-       Real_t vc = domain->v[zn] ;
-       if (eosvmin != Real_t(0.)) {
-          if (vc < eosvmin) {
-             vc = eosvmin ;
-          }
-       }
-       if (eosvmax != Real_t(0.)) {
-          if (vc > eosvmax) {
-             vc = eosvmax ;
-          }
-       }
-
-       minvol.min(vc);
-     }
-    ) ;
-
-    if ( Real_t(minvol) <= Real_t(0.) ) {
-       exit(VolumeError) ;
-    }
-
-    EvalEOSForElems(domain, vnewc, numElem);
-
-    elemMemPool.release(&vnewc) ;
-  }
-}
-
-RAJA_STORAGE
-void UpdateVolumesForElems(RAJA::IndexSet *domElemList,
-                           Real_p vnew, Real_p v, Real_t v_cut)
-{
-   RAJA::forall<elem_exec_policy>( *domElemList, [=] RAJA_DEVICE (int i) {
-      Real_t tmpV = vnew[i] ;
-
-      if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-         tmpV = Real_t(1.0) ;
-
-      v[i] = tmpV ;
-    }
-   ) ;
-
-   return ;
-}
-
-RAJA_STORAGE
-void LagrangeElements(Domain *domain, Index_t numElem)
-{
-  /* new relative volume -- temporary */
-  domain->vnew = elemMemPool.allocate(numElem) ;
-
-  CalcLagrangeElements(domain) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems(domain) ;
-
-  ApplyMaterialPropertiesForElems(domain) ;
-
-  UpdateVolumesForElems(domain->domElemList,
-                        domain->vnew, domain->v, domain->v_cut) ;
-
-  elemMemPool.release(&domain->vnew) ;
-}
-
-RAJA_STORAGE
-void CalcCourantConstraintForElems(RAJA::IndexSet *matElemList, Real_p ss,
-                                   Real_p vdov, Real_p arealg,
-                                   Real_t qqc, Real_t *dtcourant)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dtcourantLoc(Real_t(1.0e+20)) ;
-   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int indx) {
-      Real_t dtf = ss[indx] * ss[indx] ;
-
-      if ( vdov[indx] < Real_t(0.) ) {
-         dtf += qqc2 * arealg[indx] * arealg[indx] * vdov[indx] * vdov[indx] ;
-      }
-
-      Real_t dtf_cmp = (vdov[indx] != Real_t(0.))
-                     ?  arealg[indx] / SQRT(dtf) : Real_t(1.0e+20) ;
-
-      /* determine minimum timestep with its corresponding elem */
-      dtcourantLoc.min(dtf_cmp) ;
-   } ) ;
-
-   /* Don't try to register a time constraint if none of the elements
-    * were active */
-   Real_t result = Real_t(dtcourantLoc);
-   if (result < Real_t(1.0e+20)) {
-      *dtcourant = result ;
-   }
-  
-   return ;
-}
-
-RAJA_STORAGE
-void CalcHydroConstraintForElems(RAJA::IndexSet *matElemList, Real_p vdov,
-                                 Real_t dvovmax, Real_t *dthydro)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dthydroLoc(Real_t(1.0e+20)) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int indx) {
-
-      Real_t dtvov_cmp = (vdov[indx] != Real_t(0.))
-                       ? (dvovmax / (FABS(vdov[indx])+Real_t(1.e-20)))
-                       : Real_t(1.0e+10) ;
-
-      dthydroLoc.min(dtvov_cmp) ;
-   } ) ;
-
-   Real_t result = Real_t(dthydroLoc);
-   if (result < Real_t(1.0e+20)) {
-       *dthydro = result ;
-   }
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcTimeConstraintsForElems(Domain *domain) {
-   /* evaluate time constraint */
-   /* normally,  this call is on a per region basis */
-   CalcCourantConstraintForElems(domain->matElemList, domain->ss,
-                                 domain->vdov, domain->arealg,
-                                 domain->qqc, &domain->dtcourant) ;
-
-   /* check hydro constraint */
-   CalcHydroConstraintForElems(domain->matElemList, domain->vdov,
-                               domain->dvovmax, &domain->dthydro) ;
-}
-
-RAJA_STORAGE
-void LagrangeLeapFrog(Domain *domain)
-{
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal(domain);
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements(domain, domain->numElem);
-
-   if (domain->dtfixed <= Real_t(0.0)) {
-      CalcTimeConstraintsForElems(domain);
-   }
-}
-
-int main(int argc, char *argv[])
-{
-
-   RAJA::Timer timer_main;
-   RAJA::Timer timer_cycle;
-
-   timer_main.start("timer_main");
-
-   Real_t tx, ty, tz ;
-   Index_t nidx, zidx ;
-   int maxIter = 1024*1024 ;
-
-   //
-   // Allocate domain so it can be accessed from host or device code.
-   //
-   Domain* domain = Allocate<Domain>(1);
-
-   Index_t edgeElems = lulesh_edge_elems ;
-
-   for (int i=1; i<argc; ++i) {
-      if (strcmp(argv[i], "-p") == 0) {
-         show_run_progress = true ;
-      }
-      else if (strcmp(argv[i], "-i") == 0) {
-         if ((i+1 < argc) && isdigit(argv[i+1][0])) {
-            maxIter = atoi(argv[i+1]) ;
-            ++i;
-         }
-         else  {
-            printf("Iteration (-i) option has bad argument -- ignoring\n") ;
-         }
-      }     
-      else if (strcmp(argv[i], "-s") == 0) {
-         if ((i+1 < argc) && isdigit(argv[i+1][0])) {
-            edgeElems = atoi(argv[i+1]) ;
-            ++i;
-         }
-         else  {
-            printf("Size (-s) option has bad argument -- ignoring\n") ;
-         }
-      }
-   }
-
-   Index_t edgeNodes = edgeElems+1 ;
-
-   Index_p perm = 0 ;
-   Index_p iperm = 0 ;
-
-   /****************************/
-   /*  Print run parameters    */
-   /****************************/
-   printf("LULESH parallel run parameters:\n");
-   printf("\t stop time = %e\n", double(lulesh_stop_time)) ;
-   if ( lulesh_time_step > 0 ) {
-     printf("\t Fixed time step = %e\n", double(lulesh_time_step)) ;
-   } else {
-     printf("\t CFL-controlled: initial time step = %e\n", 
-            double(-lulesh_time_step)) ;
-   }
-   printf("\t Mesh size = %i x %i x %i\n", 
-          edgeElems, edgeElems, edgeElems) ;
-
-   switch (lulesh_tiling_mode) {
-      case Canonical:
-      { 
-         printf("\t Tiling mode is 'Canonical'\n");
-#if !defined(OMP_FINE_SYNC)
-#if USE_CASE == LULESH_CANONICAL
-         printf("must have OMP_FINE_SYNC defined for this tiling mode when running with > 1 threads, at present\n") ;
-         exit(-1) ;
-#endif
-#endif
-         break;
-      }
-      case Tiled_Index:
-      { 
-         printf("\t Tiling mode is 'Tiled_Index'\n");
-#if !defined(OMP_FINE_SYNC)
-         printf("must have OMP_FINE_SYNC defined for this tiling mode, at present\n") ;
-         exit(-1) ;
-#endif
-         break;
-      }
-      case Tiled_Order:
-      { 
-         printf("\t Tiling mode is 'Tiled_Order'\n");
-#if !defined(OMP_FINE_SYNC)
-         printf("must have OMP_FINE_SYNC defined for this tiling mode, at present\n") ;
-         exit(-1) ;
-#endif
-         break;
-      }
-      case Tiled_LockFree:
-      { 
-         printf("\t Tiling mode is 'Lock-free chunk'\n");
-         if ( !(std::is_same<Segment_Exec, RAJA::seq_exec>::value ||
-                std::is_same<Segment_Exec, RAJA::simd_exec>::value) ) {
-            printf("Cannot have inner parallelism for this tiling mode\n") ;
-            exit(-1) ;
-         }
-         break;
-      }
-      case Tiled_LockFreeColor:
-      { 
-         printf("\t Tiling mode is 'Lock-free color'\n");
-         if ( !std::is_same<Hybrid_Seg_Iter, RAJA::seq_segit>::value ) {
-            printf("Cannot have outer parallelism for this tiling mode\n") ;
-            exit(-1) ;
-         }
-         break;
-      }
-      case Tiled_LockFreeColorSIMD:
-      { 
-         printf("\t Tiling mode is 'Lock-free color SIMD'\n");
-         if ( !std::is_same<Hybrid_Seg_Iter, RAJA::seq_segit>::value ) {
-            printf("Cannot have outer parallelism for this tiling mode\n") ;
-            exit(-1) ;
-         }
-         break;
-      }
-      default :
-      {
-         printf("Unknown tiling mode!!!\n");
-      }
-   }
-
-   if (lulesh_tiling_mode == Tiled_Index ||
-       lulesh_tiling_mode == Tiled_Order) {
-      printf("\t Mesh tiling = %i x %i x %i\n",
-             lulesh_xtile, lulesh_ytile, lulesh_ztile) ;
-   }
-
-
-   
-
-   /****************************/
-   /*   Initialize Sedov Mesh  */
-   /****************************/
-
-   /* construct a uniform box for this processor */
-
-   domain->sizeX = edgeElems ;
-   domain->sizeY = edgeElems ;
-   domain->sizeZ = edgeElems ;
-   domain->numElem = edgeElems*edgeElems*edgeElems ;
-
-   domain->numNode = edgeNodes*edgeNodes*edgeNodes ;
-
-   Index_t domElems = domain->numElem ;
-   Index_t domNodes = domain->numNode ;
-
-   /**************************************************/
-   /* Create Nodelist, needed by Tiled_LockFreeColor */
-   /**************************************************/
-
-   /* elemToNode connectivity */
-   domain->nodelist = Allocate<Index_t>(8*domElems) ;
-
-   /* embed hexehedral elements in nodal point lattice */
-
-   nidx = 0 ;
-   zidx = 0 ;
-   for (Index_t plane=0; plane<edgeElems; ++plane) {
-      for (Index_t row=0; row<edgeElems; ++row) {
-         for (Index_t col=0; col<edgeElems; ++col) {
-            Index_p localNode = &domain->nodelist[8*zidx] ;
-            localNode[0] = nidx                                       ;
-            localNode[1] = nidx                                   + 1 ;
-            localNode[2] = nidx                       + edgeNodes + 1 ;
-            localNode[3] = nidx                       + edgeNodes     ;
-            localNode[4] = nidx + edgeNodes*edgeNodes                 ;
-            localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
-            localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
-            localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
-            ++zidx ;
-            ++nidx ;
-         }
-         ++nidx ;
-      }
-      nidx += edgeNodes ;
-   }
-
-   /****************************/
-   /*   Create domain ISets    */
-   /****************************/
-
-   /* always leave the nodes in a canonical ordering */
-   domain->domNodeList = new RAJA::IndexSet() ;
-   domain->domNodeList->push_back( RAJA::RangeSegment(0, domNodes) ) ;
-
-   domain->domElemList = new RAJA::IndexSet() ;
-   domain->matElemList = new RAJA::IndexSet() ;
-
-   const Index_t xtile = lulesh_xtile ;
-   const Index_t ytile = lulesh_ytile ;
-   const Index_t ztile = lulesh_ztile ;
-
-   switch (lulesh_tiling_mode) {
-
-      case Canonical:
-      {
-         domain->domElemList->push_back( RAJA::RangeSegment(0, domElems) );
-
-         /* Create a material ISet (entire domain same material for now) */
-         domain->matElemList->push_back( RAJA::RangeSegment(0, domElems) ) ;
-      }
-      break ;
-
-      case Tiled_Index:
-      {
-         for (Index_t zt = 0; zt < ztile; ++zt) {
-            for (Index_t yt = 0; yt < ytile; ++yt) {
-               for (Index_t xt = 0; xt < xtile; ++xt) {
-                  Index_t xbegin =  edgeElems*( xt )/xtile ;
-                  Index_t xend   =  edgeElems*(xt+1)/xtile ;
-                  Index_t ybegin =  edgeElems*( yt )/ytile ;
-                  Index_t yend   =  edgeElems*(yt+1)/ytile ;
-                  Index_t zbegin =  edgeElems*( zt )/ztile ;
-                  Index_t zend   =  edgeElems*(zt+1)/ztile ;
-                  Index_t tileSize =
-                     (xend - xbegin)*(yend-ybegin)*(zend-zbegin) ;
-                  Index_t tileIdx[tileSize] ;
-                  Index_t idx = 0 ;
-
-                  for (Index_t plane = zbegin; plane<zend; ++plane) {
-                     for (Index_t row = ybegin; row<yend; ++row) {
-                        for (Index_t col = xbegin; col<xend; ++col) {
-                           tileIdx[idx++] =
-                              (plane*edgeElems + row)*edgeElems + col ;
-                        }
-                     }
-                  }
-                  domain->domElemList->push_back( RAJA::ListSegment(tileIdx, tileSize) );
-                  domain->matElemList->push_back( RAJA::ListSegment(tileIdx, tileSize) );
-               }
-            }
-         }
-      }
-      break ;
-
-      case Tiled_Order:
-      {
-         Index_t idx = 0 ;
-         perm  = Allocate<Index_t>(domElems) ;
-         iperm = Allocate<Index_t>(domElems) ; /* inverse permutation */
-         Index_t tileBegin = 0 ;
-         for (Index_t zt = 0; zt < ztile; ++zt) {
-            for (Index_t yt = 0; yt < ytile; ++yt) {
-               for (Index_t xt = 0; xt < xtile; ++xt) {
-                  Index_t xbegin =  edgeElems*( xt )/xtile ;
-                  Index_t xend   =  edgeElems*(xt+1)/xtile ;
-                  Index_t ybegin =  edgeElems*( yt )/ytile ;
-                  Index_t yend   =  edgeElems*(yt+1)/ytile ;
-                  Index_t zbegin =  edgeElems*( zt )/ztile ;
-                  Index_t zend   =  edgeElems*(zt+1)/ztile ;
-                  Index_t tileSize =
-                     (xend - xbegin)*(yend-ybegin)*(zend-zbegin) ;
-
-                  for (Index_t plane = zbegin; plane<zend; ++plane) {
-                     for (Index_t row = ybegin; row<yend; ++row) {
-                        for (Index_t col = xbegin; col<xend; ++col) {
-                           perm[idx] =
-                              (plane*edgeElems + row)*edgeElems + col ;
-                           iperm[perm[idx]] = idx ;
-                           ++idx ;
-                        }
-                     }
-                  }
-                  Index_t tileEnd = tileBegin + tileSize ;
-                  domain->domElemList->push_back( RAJA::RangeSegment(tileBegin, tileEnd) );
-                  domain->matElemList->push_back( RAJA::RangeSegment(tileBegin, tileEnd) );
-                  tileBegin = tileEnd ;
-               }
-            }
-         }
-      }
-      break ;
-
-      case Tiled_LockFree:
-      {
-         buildLockFreeBlockIndexset( *domain->domElemList,
-                                     edgeElems, edgeElems, edgeElems) ;
-
-         /* Create a material indexset (entire domain same material for now) */
-         buildLockFreeBlockIndexset ( *domain->matElemList,
-                                      edgeElems, edgeElems, edgeElems) ;
-      }
-      break;
-
-      case Tiled_LockFreeColor:
-      {
-         // printf("Elements:\n") ;
-         buildLockFreeColorIndexset( *domain->domElemList,
-                                     domain->nodelist, domElems, 8, domNodes) ;
-
-         /* Create a material indexset (entire domain same material for now) */
-         // printf("Material:\n") ;
-         buildLockFreeColorIndexset ( *domain->matElemList,
-                                      domain->nodelist, domElems, 8, domNodes) ;
-      }
-      break;
-
-      case Tiled_LockFreeColorSIMD:
-      {
-         perm  = Allocate<Index_t>(domElems) ;
-         iperm = Allocate<Index_t>(domElems) ; /* inverse permutation */
-
-         // printf("Elements:\n") ;
-         buildLockFreeColorIndexset( *domain->domElemList,
-                                     domain->nodelist, domElems, 8, domNodes,
-                                     perm, iperm) ;
-
-         /* Create a material indexset (entire domain same material for now) */
-         // printf("Material:\n") ;
-         buildLockFreeColorIndexset ( *domain->matElemList,
-                                      domain->nodelist, domElems, 8, domNodes,
-                                      perm, iperm) ;
-      }
-      break;
-
-      default :
-      {
-         printf("Only Tiled_LockFree or Canonical is implemented!!!\n");
-         exit(-1) ;
-      }
-   }
-
-   /* Boundary nodesets */
-
-   domain->symmX = new RAJA::IndexSet() ;
-   {
-     Index_t *nset = new Index_t[edgeNodes*edgeNodes] ;
-     Index_t nidx = 0 ;
-     for (Index_t i=0; i<edgeNodes; ++i) {
-       Index_t planeInc = i*edgeNodes*edgeNodes ;
-       for (Index_t j=0; j<edgeNodes; ++j) {
-         nset[nidx++] = planeInc + j*edgeNodes ;
-       }
-     }
-     domain->symmX->push_back( RAJA::ListSegment(nset, edgeNodes*edgeNodes) );
-     delete [] nset ;
-   }
-
-   domain->symmY = new RAJA::IndexSet() ;
-   {
-     Index_t *nset = new Index_t[edgeNodes*edgeNodes] ;
-     Index_t nidx = 0 ;
-     for (Index_t i=0; i<edgeNodes; ++i) {
-       Index_t planeInc = i*edgeNodes*edgeNodes ;
-       for (Index_t j=0; j<edgeNodes; ++j) {
-         nset[nidx++] = planeInc + j ;
-       }
-     }
-     domain->symmY->push_back( RAJA::ListSegment(nset, edgeNodes*edgeNodes) );
-     delete [] nset ;
-   }
-
-   domain->symmZ = new RAJA::IndexSet() ;
-   {
-     domain->symmZ->push_back( RAJA::RangeSegment(0, edgeNodes*edgeNodes) );
-   }
-
-
-   /*************************/
-   /* allocate field memory */
-   /*************************/
-   
-   /*****************/
-   /* Elem-centered */
-   /*****************/
-
-   /* elem connectivity through face */
-   domain->lxim =
-     AllocateTouch<elem_exec_policy, Index_t>(domain->domElemList, domElems) ;
-   domain->lxip =
-     AllocateTouch<elem_exec_policy, Index_t>(domain->domElemList, domElems) ;
-   domain->letam =
-     AllocateTouch<elem_exec_policy, Index_t>(domain->domElemList, domElems) ;
-   domain->letap =
-     AllocateTouch<elem_exec_policy, Index_t>(domain->domElemList, domElems) ;
-   domain->lzetam =
-     AllocateTouch<elem_exec_policy, Index_t>(domain->domElemList, domElems) ;
-   domain->lzetap =
-     AllocateTouch<elem_exec_policy, Index_t>(domain->domElemList, domElems) ;
-
-   /* elem face symm/free-surface flag */
-   domain->elemBC =
-     AllocateTouch<elem_exec_policy, Int_t>(domain->domElemList, domElems) ;
-
-   domain->volo = /* reference volume */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-
-   domain->elemMass = /* element mass */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-
-   /*****************/
-   /* Node-centered */
-   /*****************/
-
-   /* coordinates */
-   domain->x =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-   domain->y =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes)  ;
-   domain->z =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes)  ;
-
-   /* velocities */
-   domain->xd =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-   domain->yd =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-   domain->zd =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-
-   /* accelerations */
-   domain->xdd =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-   domain->ydd =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-   domain->zdd =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-
-   /* forces */
-   domain->fx =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-   domain->fy =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-   domain->fz =
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-
-   domain->nodalMass = /* nodal mass */
-     AllocateTouch<node_exec_policy, Real_t>(domain->domNodeList, domNodes) ;
-
-   /* initialize nodal coordinates */
-
-   nidx = 0 ;
-   tz  = Real_t(0.) ;
-   for (Index_t plane=0; plane<edgeNodes; ++plane) {
-      ty = Real_t(0.) ;
-      for (Index_t row=0; row<edgeNodes; ++row) {
-         tx = Real_t(0.) ;
-         for (Index_t col=0; col<edgeNodes; ++col) {
-            domain->x[nidx] = tx ;
-            domain->y[nidx] = ty ;
-            domain->z[nidx] = tz ;
-            ++nidx ;
-            // tx += ds ; /* may accumulate roundoff... */
-            tx = Real_t(1.125)*Real_t(col+1)/Real_t(edgeElems) ;
-         }
-         // ty += ds ;  /* may accumulate roundoff... */
-         ty = Real_t(1.125)*Real_t(row+1)/Real_t(edgeElems) ;
-      }
-      // tz += ds ;  /* may accumulate roundoff... */
-      tz = Real_t(1.125)*Real_t(plane+1)/Real_t(edgeElems) ;
-   }
-
-   /* initialize material parameters */
-   domain->dtfixed = Real_t(lulesh_time_step) ;
-   domain->deltatime = ((domain->dtfixed < Real_t(0.0)) ?
-                         -domain->dtfixed : domain->dtfixed) ; 
-   domain->deltatimemultlb = Real_t(1.1) ;
-   domain->deltatimemultub = Real_t(1.2) ;
-   domain->stoptime  = Real_t(lulesh_stop_time) ;
-   domain->dtcourant = Real_t(1.0e+20) ;
-   domain->dthydro   = Real_t(1.0e+20) ;
-   domain->dtmax     = Real_t(1.0e-2) ;
-   domain->time    = Real_t(0.) ;
-   domain->cycle   = 0 ;
-
-   domain->e_cut = Real_t(1.0e-7) ;
-   domain->p_cut = Real_t(1.0e-7) ;
-   domain->q_cut = Real_t(1.0e-7) ;
-   domain->u_cut = Real_t(1.0e-7) ;
-   domain->v_cut = Real_t(1.0e-10) ;
-
-   domain->hgcoef      = Real_t(3.0) ;
-   domain->ss4o3       = Real_t(4.0)/Real_t(3.0) ;
-
-   domain->qstop              =  Real_t(1.0e+12) ;
-   domain->monoq_max_slope    =  Real_t(1.0) ;
-   domain->monoq_limiter_mult =  Real_t(2.0) ;
-   domain->qlc_monoq          = Real_t(0.5) ;
-   domain->qqc_monoq          = Real_t(2.0)/Real_t(3.0) ;
-   domain->qqc                = Real_t(2.0) ;
-
-   domain->pmin =  Real_t(0.) ;
-   domain->emin = Real_t(-1.0e+15) ;
-
-   domain->dvovmax =  Real_t(0.1) ;
-
-   domain->eosvmax =  Real_t(1.0e+9) ;
-   domain->eosvmin =  Real_t(1.0e-9) ;
-
-   domain->refdens =  Real_t(1.0) ;
-
-   /* initialize field data */
-   for (Index_t i=0; i<domElems; ++i) {
-      Real_t x_local[8], y_local[8], z_local[8] ;
-      Index_p elemToNode = &domain->nodelist[8*i] ;
-      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-      {
-        Index_t gnode = elemToNode[lnode];
-        x_local[lnode] = domain->x[gnode];
-        y_local[lnode] = domain->y[gnode];
-        z_local[lnode] = domain->z[gnode];
-      }
-
-      // volume calculations
-      Real_t volume = CalcElemVolume(x_local, y_local, z_local );
-      domain->volo[i] = volume ;
-      domain->elemMass[i] = volume ;
-      for (Index_t j=0; j<8; ++j) {
-         Index_t idx = elemToNode[j] ;
-         domain->nodalMass[idx] += volume / Real_t(8.0) ;
-      }
-   }
-
-   /* set up elemement connectivity information */
-   domain->lxim[0] = 0 ;
-   for (Index_t i=1; i<domElems; ++i) {
-      domain->lxim[i]   = i-1 ;
-      domain->lxip[i-1] = i ;
-   }
-   domain->lxip[domElems-1] = domElems-1 ;
-
-   for (Index_t i=0; i<edgeElems; ++i) {
-      domain->letam[i] = i ; 
-      domain->letap[domElems-edgeElems+i] = domElems-edgeElems+i ;
-   }
-   for (Index_t i=edgeElems; i<domElems; ++i) {
-      domain->letam[i] = i-edgeElems ;
-      domain->letap[i-edgeElems] = i ;
-   }
-
-   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
-      domain->lzetam[i] = i ;
-      domain->lzetap[domElems-edgeElems*edgeElems+i] = domElems-edgeElems*edgeElems+i ;
-   }
-   for (Index_t i=edgeElems*edgeElems; i<domElems; ++i) {
-      domain->lzetam[i] = i - edgeElems*edgeElems ;
-      domain->lzetap[i-edgeElems*edgeElems] = i ;
-   }
-
-   /* set up boundary condition information */
-   for (Index_t i=0; i<domElems; ++i) {
-      domain->elemBC[i] = 0 ;  /* clear BCs by default */
-   }
-
-   /* faces on "external" boundaries will be */
-   /* symmetry plane or free surface BCs */
-   for (Index_t i=0; i<edgeElems; ++i) {
-      Index_t planeInc = i*edgeElems*edgeElems ;
-      Index_t rowInc   = i*edgeElems ;
-      for (Index_t j=0; j<edgeElems; ++j) {
-         domain->elemBC[planeInc+j*edgeElems] |= XI_M_SYMM ;
-         domain->elemBC[planeInc+j*edgeElems+edgeElems-1] |= XI_P_FREE ;
-         domain->elemBC[planeInc+j] |= ETA_M_SYMM ;
-         domain->elemBC[planeInc+j+edgeElems*edgeElems-edgeElems] |= ETA_P_FREE ;
-         domain->elemBC[rowInc+j] |= ZETA_M_SYMM ;
-         domain->elemBC[rowInc+j+domElems-edgeElems*edgeElems] |= ZETA_P_FREE ;
-      }
-   }
-
-   if (lulesh_tiling_mode == Tiled_Order ||
-       lulesh_tiling_mode == Tiled_LockFreeColorSIMD) {
-      /* permute nodelist connectivity */
-      {
-         Index_t tmp[8*domElems] ;
-         for (Index_t i=0; i<domElems; ++i) {
-            for (Index_t j=0; j<8; ++j) {
-               tmp[i*8+j] = domain->nodelist[perm[i]*8+j] ;
-            }
-         }
-         for (Index_t i=0; i<8*domElems; ++i) {
-            domain->nodelist[i] = tmp[i] ;
-         }
-      }
-      /* permute volo */
-      {
-         Real_t tmp[domElems] ;
-         for (Index_t i=0; i<domElems; ++i) {
-            tmp[i] = domain->volo[perm[i]] ;
-         }
-         for (Index_t i=0; i<domElems; ++i) {
-            domain->volo[i] = tmp[i] ;
-         }
-      }
-      /* permute elemMass */
-      {
-         Real_t tmp[domElems] ;
-         for (Index_t i=0; i<domElems; ++i) {
-            tmp[i] = domain->elemMass[perm[i]] ;
-         }
-         for (Index_t i=0; i<domElems; ++i) {
-            domain->elemMass[i] = tmp[i] ;
-         }
-      }
-      /* permute lxim, lxip, letam, letap, lzetam, lzetap */
-      {
-         Index_t tmp[6*domElems] ;
-         for (Index_t i=0; i<domElems; ++i) {
-            tmp[i*6+0] = iperm[domain->lxim[perm[i]]] ;
-            tmp[i*6+1] = iperm[domain->lxip[perm[i]]] ;
-            tmp[i*6+2] = iperm[domain->letam[perm[i]]] ;
-            tmp[i*6+3] = iperm[domain->letap[perm[i]]] ;
-            tmp[i*6+4] = iperm[domain->lzetam[perm[i]]] ;
-            tmp[i*6+5] = iperm[domain->lzetap[perm[i]]] ;
-         }
-         for (Index_t i=0; i<domElems; ++i) {
-            domain->lxim[i] = tmp[i*6+0] ;
-            domain->lxip[i] = tmp[i*6+1] ;
-            domain->letam[i] = tmp[i*6+2] ;
-            domain->letap[i] = tmp[i*6+3] ;
-            domain->lzetam[i] = tmp[i*6+4] ;
-            domain->lzetap[i] = tmp[i*6+5] ;
-         }
-      }
-      /* permute elemBC */
-      {
-         Int_t tmp[domElems] ;
-         for (Index_t i=0; i<domElems; ++i) {
-            tmp[i] = domain->elemBC[perm[i]] ;
-         }
-         for (Index_t i=0; i<domElems; ++i) {
-            domain->elemBC[i] = tmp[i] ;
-         }
-      }
-      Release(&iperm) ;
-      Release(&perm) ;
-   }
-
-   /*****************/
-   /* Elem-centered */
-   /*****************/
-
-   domain->e =  /* energy */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-   domain->p = /* pressure */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-
-   domain->q = /* artificial viscosity */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-   domain->ql = /* linear term for q */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-   domain->qq = /* quadratic term for q */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-
-   domain->v = /* relative volume */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ; 
-   domain->delv = /* m_vnew - m_v */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-   domain->vdov = /* volume deriv over volume */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-
-   domain->arealg = /* elem characteristic length */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-
-   domain->ss = /* "sound speed" */
-     AllocateTouch<elem_exec_policy, Real_t>(domain->domElemList, domElems) ;
-
-   /* Basic Field Initialization */
-
-   for (Index_t i=0; i<domElems; ++i) {
-      domain->v[i] = Real_t(1.0) ;
-
-
-   }
-
-   /* deposit energy -- we know elem zero stays put for all permutations */
-   domain->e[0] = Real_t(3.948746e+7) ;
-
-#if defined(OMP_FINE_SYNC)
-   // set up node-centered indexing of elements
-   {
-   Index_p nodeElemCount = Allocate<Index_t>(domNodes) ;
-
-   for (Index_t i=0; i<domNodes; ++i) {
-     nodeElemCount[i] = 0 ;
-   }
-
-   for (Index_t i=0; i<domElems; ++i) {
-     Index_p nl = &domain->nodelist[8*i] ;
-     for (Index_t j=0; j < 8; ++j) {
-       ++(nodeElemCount[nl[j]] );
-     }
-   }
-
-   domain->nodeElemStart = Allocate<Index_t>(domNodes+1) ;
-
-   domain->nodeElemStart[0] = 0;
-
-   for (Index_t i=1; i <= domNodes; ++i) {
-     domain->nodeElemStart[i] =
-       domain->nodeElemStart[i-1] + nodeElemCount[i-1] ;
-   }
-
-   domain->nodeElemCornerList =
-      Allocate<Index_t>(domain->nodeElemStart[domNodes]);
-
-   for (Index_t i=0; i < domNodes; ++i) {
-     nodeElemCount[i] = 0;
-   }
-
-   for (Index_t i=0; i < domElems; ++i) {
-     Index_p nl = &domain->nodelist[8*i] ;
-     for (Index_t j=0; j < 8; ++j) {
-       Index_t m = nl[j];
-       Index_t k = i*8 + j ;
-       Index_t offset = domain->nodeElemStart[m] + nodeElemCount[m] ;
-       domain->nodeElemCornerList[offset] = k;
-       ++(nodeElemCount[m]) ;
-     }
-   }
-
-#ifdef DEBUG_LULESH
-   Index_t clSize = domain->nodeElemStart[domNodes] ;
-   for (Index_t i=0; i < clSize; ++i) {
-     Index_t clv = domain->nodeElemCornerList[i] ;
-     if ((clv < 0) || (clv > domElems*8)) {
-       fprintf(stderr,
-        "AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n");
-       exit(-1);
-     }
-   }
-#endif
-   Release(&nodeElemCount) ;
-   }
-#endif
-
-   /* timestep to solution */
-   timer_cycle.start("timer_cycle");
-   while((domain->time < domain->stoptime) && (domain->cycle < maxIter)) {
-      TimeIncrement(domain) ;
-      LagrangeLeapFrog(domain) ;
-      /* problem->commNodes->Transfer(CommNodes::syncposvel) ; */
-      if ( show_run_progress ) {
-         printf("cycle = %d, time = %e, dt=%e\n",
-                int(domain->cycle),double(domain->time), double(domain->deltatime) ) ;
-      }
-   }
-   timer_cycle.stop("timer_cycle");
-
-   timer_main.stop("timer_main");
-
-   printf("Total Cycle Time (sec) = %Lf\n", timer_cycle.elapsed() );
-   printf("Total main Time (sec) = %Lf\n", timer_main.elapsed() );
-
-   Release(&domain) ;
-
-   return 0 ;
-}
diff --git a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-serial.cxx b/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-serial.cxx
deleted file mode 100644
index c70257a62..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/luleshRAJA-serial.cxx
+++ /dev/null
@@ -1,3200 +0,0 @@
-/*
-
-                 Copyright (c) 2010.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 1.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <cctype>
-
-#include "RAJA/RAJA.hxx"
-
-#include "RAJA/IndexSetBuilders.hxx"
-
-#include "Timer.hxx"
-
-
-/*
- ***********************************************
- * Set parameters that define how code will run.
- ***********************************************
- */
-
-//
-// Display simulation time and timestep during run.
-//
-bool show_run_progress = false;
-
-//
-// Set stop time and time increment for run.
-//
-// The absolute value of lulesh_time_step sets the first time step increment.
-//   - If < 0, the CFL condition will be used to determine subsequent time
-//     step sizes (with some upper bound on the amount the timestep can grow).
-//   - If > 0, the time step will be fixed for the entire run.
-//
-const double lulesh_stop_time = 1.0e-2;
-const double lulesh_time_step = -1.0e-7;
-
-//
-// Set mesh size (physical domain size is fixed).
-//
-// Mesh will be lulesh_edge_elems^3.
-//
-const int lulesh_edge_elems = 45;
-
-
-//
-//   Tiling mode.
-//
-enum TilingMode
-{
-   Canonical,       // canonical element ordering -- single range segment
-   Tiled_Index,     // canonical ordering, tiled using unstructured segments
-   Tiled_Order,     // elements permuted, tiled using range segments
-   Tiled_LockFree,  // tiled ordering, lock-free
-};
-//TilingMode lulesh_tiling_mode = Canonical;
-//TilingMode lulesh_tiling_mode = Tiled_Index;
-//TilingMode lulesh_tiling_mode = Tiled_Order;
-TilingMode lulesh_tiling_mode = Tiled_LockFree;
-
-//
-// Set number of tiles in each mesh direction for non-canonical oerderings.
-//
-const int lulesh_xtile = 2;
-const int lulesh_ytile = 2;
-const int lulesh_ztile = 2;
-
-//
-//   RAJA IndexSet type used in loop traversals.
-//
-//   Need to verify if this can be set to RangeSegment or ListSegment
-//   types. It may be useful to compare IndexSet performance to
-//   basic segment types; e.g.,
-//
-//     - Canonical ordering should be able to use IndexSet or
-//                                                RangeSegment.
-//     - Tiled_Index ordering should be able to use IndexSet or
-//                                                  ListSegment.
-//
-//   Policies for index set segment iteration and segment execution.
-//
-//   NOTE: Currently, we apply single policy across all loop patterns.
-//
-typedef RAJA::seq_segit IndexSet_Seg_Iter;
-
-//typedef RAJA::seq_exec Segment_Exec;
-typedef RAJA::simd_exec Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_Seg_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_Seg_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_Seg_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_Seg_Iter, Segment_Exec> minloc_exec_policy;
-typedef                                            Segment_Exec  range_exec_policy;
-
-typedef                                            RAJA::seq_reduce  reduce_policy;
-
-
-//
-// use RAJA data types for loop operations using RAJA
-//
-typedef RAJA::Index_type  Index_t ; /* array subscript and loop index */
-typedef RAJA::Real_type   Real_t ;  /* floating point representation */
-typedef RAJA::Real_ptr    Real_p;
-typedef RAJA::const_Real_ptr   const_Real_p;
-typedef RAJA::Index_type* Index_p;
-
-/****************************************************/
-/*                                                  */
-/* Allow flexibility for arithmetic representations */
-/*                                                  */
-/* Think about how to make this consistent w/RAJA   */
-/* type parameterization (above)!!                  */
-/*                                                  */
-/****************************************************/
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-/* Could also support fixed point and interval arithmetic types */
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  /* 10 bytes on x86 */
-
-typedef int    Int_t ;   /* integer representation */
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-#define RAJA_STORAGE static inline
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-
-/*********************************/
-/* Data structure implementation */
-/*********************************/
-
-/* might want to add access methods so that memory can be */
-/* better managed, as in luleshFT */
-
-struct Domain {
-   /* Elem-centered */
-
-   RAJA::IndexSet *domElemList ;   /* elem indexset */
-   RAJA::IndexSet *matElemList ;   /* material indexset */
-   Index_p nodelist ;     /* elemToNode connectivity */
-
-   Index_p lxim ;         /* elem connectivity through face */
-   Index_p lxip ;
-   Index_p letam ;
-   Index_p letap ;
-   Index_p lzetam ;
-   Index_p lzetap ;
-
-   Int_t *elemBC ;         /* elem face symm/free-surface flag */
-
-   Real_p e ;             /* energy */
-
-   Real_p p ;             /* pressure */
-
-   Real_p q ;             /* q */
-   Real_p ql ;            /* linear term for q */
-   Real_p qq ;            /* quadratic term for q */
-
-   Real_p v ;             /* relative volume */
-
-   Real_p volo ;          /* reference volume */
-   Real_p delv ;          /* m_vnew - m_v */
-   Real_p vdov ;          /* volume derivative over volume */
-
-   Real_p arealg ;        /* elem characteristic length */
-
-   Real_p ss ;            /* "sound speed" */
-
-   Real_p elemMass ;      /* mass */
-
-   /* Elem temporaries */
-
-   Real_p vnew ;          /* new relative volume -- temporary */
-
-   Real_p delv_xi ;       /* velocity gradient -- temporary */
-   Real_p delv_eta ;
-   Real_p delv_zeta ;
-
-   Real_p delx_xi ;       /* position gradient -- temporary */
-   Real_p delx_eta ;
-   Real_p delx_zeta ;
-
-   Real_p dxx ;          /* principal strains -- temporary */
-   Real_p dyy ;
-   Real_p dzz ;
-
-   /* Node-centered */
-
-   RAJA::IndexSet *domNodeList ;   /* node indexset */
-
-   Real_p x ;             /* coordinates */
-   Real_p y ;
-   Real_p z ;
-
-   Real_p xd ;            /* velocities */
-   Real_p yd ;
-   Real_p zd ;
-
-   Real_p xdd ;           /* accelerations */
-   Real_p ydd ;
-   Real_p zdd ;
-
-   Real_p fx ;            /* forces */
-   Real_p fy ;
-   Real_p fz ;
-
-   Real_p nodalMass ;     /* mass */
-
-
-   /* Boundary nodesets */
-
-   Index_p symmX ;        /* Nodes on X symmetry plane */
-   Index_p symmY ;        /* Nodes on Y symmetry plane */
-   Index_p symmZ ;        /* Nodes on Z symmetry plane */
-
-   /* Parameters */
-
-   Real_t  dtfixed ;           /* fixed time increment */
-   Real_t  time ;              /* current time */
-   Real_t  deltatime ;         /* variable time increment */
-   Real_t  deltatimemultlb ;
-   Real_t  deltatimemultub ;
-   Real_t  stoptime ;          /* end time for simulation */
-
-   Real_t  u_cut ;             /* velocity tolerance */
-   Real_t  hgcoef ;            /* hourglass control */
-   Real_t  qstop ;             /* excessive q indicator */
-   Real_t  monoq_max_slope ;
-   Real_t  monoq_limiter_mult ;
-   Real_t  e_cut ;             /* energy tolerance */
-   Real_t  p_cut ;             /* pressure tolerance */
-   Real_t  ss4o3 ;
-   Real_t  q_cut ;             /* q tolerance */
-   Real_t  v_cut ;             /* relative volume tolerance */
-   Real_t  qlc_monoq ;         /* linear term coef for q */
-   Real_t  qqc_monoq ;         /* quadratic term coef for q */
-   Real_t  qqc ;
-   Real_t  eosvmax ;
-   Real_t  eosvmin ;
-   Real_t  pmin ;              /* pressure floor */
-   Real_t  emin ;              /* energy floor */
-   Real_t  dvovmax ;           /* maximum allowable volume change */
-   Real_t  refdens ;           /* reference density */
-
-   Real_t  dtcourant ;         /* courant constraint */
-   Real_t  dthydro ;           /* volume change constraint */
-   Real_t  dtmax ;             /* maximum allowable time increment */
-
-   Int_t   cycle ;             /* iteration count for simulation */
-
-   Index_t sizeX ;
-   Index_t sizeY ;
-   Index_t sizeZ ;
-   Index_t numElem ;
-
-   Index_t numNode ;
-} ;
-
-// ########################################################
-//  Memory allocate/release routines
-// ########################################################
-#include "luleshMemory.hxx"
-
-/* Stuff needed for boundary conditions */
-/* 2 BCs on each of 6 hexahedral faces (12 bits) */
-#define XI_M        0x003
-#define XI_M_SYMM   0x001
-#define XI_M_FREE   0x002
-
-#define XI_P        0x00c
-#define XI_P_SYMM   0x004
-#define XI_P_FREE   0x008
-
-#define ETA_M       0x030
-#define ETA_M_SYMM  0x010
-#define ETA_M_FREE  0x020
-
-#define ETA_P       0x0c0
-#define ETA_P_SYMM  0x040
-#define ETA_P_FREE  0x080
-
-#define ZETA_M      0x300
-#define ZETA_M_SYMM 0x100
-#define ZETA_M_FREE 0x200
-
-#define ZETA_P      0xc00
-#define ZETA_P_SYMM 0x400
-#define ZETA_P_FREE 0x800
-
-
-RAJA_STORAGE
-void TimeIncrement(Domain *domain)
-{
-   Real_t targetdt = domain->stoptime - domain->time ;
-
-   if ((domain->dtfixed <= Real_t(0.0)) && (domain->cycle != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = domain->deltatime ;
-
-      /* This will require a reduction in parallel */
-      Real_t newdt = Real_t(1.0e+20) ;
-      if (domain->dtcourant < newdt) {
-         newdt = domain->dtcourant / Real_t(2.0) ;
-      }
-      if (domain->dthydro < newdt) {
-         newdt = domain->dthydro * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < domain->deltatimemultlb) {
-            newdt = olddt ;
-         }
-         else if (ratio > domain->deltatimemultub) {
-            newdt = olddt*domain->deltatimemultub ;
-         }
-      }
-
-      if (newdt > domain->dtmax) {
-         newdt = domain->dtmax ;
-      }
-      domain->deltatime = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > domain->deltatime) &&
-       (targetdt < (Real_t(4.0) * domain->deltatime / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * domain->deltatime / Real_t(3.0) ;
-   }
-
-   if (targetdt < domain->deltatime) {
-      domain->deltatime = targetdt ;
-   }
-
-   domain->time += domain->deltatime ;
-
-   ++domain->cycle ;
-}
-
-RAJA_STORAGE
-void InitStressTermsForElems(Real_p p, Real_p q,
-                             Real_p sigxx, Real_p sigyy, Real_p sigzz,
-                             RAJA::IndexSet *domElemList)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-
-   RAJA::forall<elem_exec_policy>(*domElemList, [=] (int idx) {
-      sigxx[idx] = sigyy[idx] = sigzz[idx] =  - p[idx] - q[idx] ;
-     }
-   ) ;
-}
-
-RAJA_STORAGE
-void CalcElemShapeFunctionDerivatives( const_Real_p x,
-                                       const_Real_p y,
-                                       const_Real_p z,
-                                       Real_t b[][8],
-                                       Real_t* const volume
-                                     )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-RAJA_STORAGE
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-RAJA_STORAGE
-void CalcElemNodeNormals(
-                         Real_p pfx,
-                         Real_p pfy,
-                         Real_p pfz,
-                         const_Real_p x,
-                         const_Real_p y,
-                         const_Real_p z
-                        )
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-RAJA_STORAGE
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_p fx, Real_p fy, Real_p fz
-                                )
-{
-  Real_t pfx0 = B[0][0] ;   Real_t pfx1 = B[0][1] ;
-  Real_t pfx2 = B[0][2] ;   Real_t pfx3 = B[0][3] ;
-  Real_t pfx4 = B[0][4] ;   Real_t pfx5 = B[0][5] ;
-  Real_t pfx6 = B[0][6] ;   Real_t pfx7 = B[0][7] ;
-
-  Real_t pfy0 = B[1][0] ;   Real_t pfy1 = B[1][1] ;
-  Real_t pfy2 = B[1][2] ;   Real_t pfy3 = B[1][3] ;
-  Real_t pfy4 = B[1][4] ;   Real_t pfy5 = B[1][5] ;
-  Real_t pfy6 = B[1][6] ;   Real_t pfy7 = B[1][7] ;
-
-  Real_t pfz0 = B[2][0] ;   Real_t pfz1 = B[2][1] ;
-  Real_t pfz2 = B[2][2] ;   Real_t pfz3 = B[2][3] ;
-  Real_t pfz4 = B[2][4] ;   Real_t pfz5 = B[2][5] ;
-  Real_t pfz6 = B[2][6] ;   Real_t pfz7 = B[2][7] ;
-
-  fx[0] = -( stress_xx * pfx0 );
-  fx[1] = -( stress_xx * pfx1 );
-  fx[2] = -( stress_xx * pfx2 );
-  fx[3] = -( stress_xx * pfx3 );
-  fx[4] = -( stress_xx * pfx4 );
-  fx[5] = -( stress_xx * pfx5 );
-  fx[6] = -( stress_xx * pfx6 );
-  fx[7] = -( stress_xx * pfx7 );
-
-  fy[0] = -( stress_yy * pfy0  );
-  fy[1] = -( stress_yy * pfy1  );
-  fy[2] = -( stress_yy * pfy2  );
-  fy[3] = -( stress_yy * pfy3  );
-  fy[4] = -( stress_yy * pfy4  );
-  fy[5] = -( stress_yy * pfy5  );
-  fy[6] = -( stress_yy * pfy6  );
-  fy[7] = -( stress_yy * pfy7  );
-
-  fz[0] = -( stress_zz * pfz0 );
-  fz[1] = -( stress_zz * pfz1 );
-  fz[2] = -( stress_zz * pfz2 );
-  fz[3] = -( stress_zz * pfz3 );
-  fz[4] = -( stress_zz * pfz4 );
-  fz[5] = -( stress_zz * pfz5 );
-  fz[6] = -( stress_zz * pfz6 );
-  fz[7] = -( stress_zz * pfz7 );
-}
-
-RAJA_STORAGE
-void IntegrateStressForElems( Index_p nodelist,
-                              Real_p x,  Real_p y,  Real_p z,
-                              Real_p fx, Real_p fy, Real_p fz,
-                              Real_p sigxx, Real_p sigyy, Real_p sigzz,
-                              Real_p determ, RAJA::IndexSet *domElemList)
-{
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(*domElemList, [=] (int k) {
-    Real_t B[3][8] ;// shape function derivatives
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-    Real_t fx_local[8] ;
-    Real_t fy_local[8] ;
-    Real_t fz_local[8] ;
-
-    const Index_p elemNodes = &nodelist[8*k];
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      x_local[lnode] = x[gnode];
-      y_local[lnode] = y[gnode];
-      z_local[lnode] = z[gnode];
-    }
-
-    /* Volume calculation involves extra work for numerical consistency. */
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                     B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                         x_local, y_local, z_local );
-
-    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                 fx_local, fy_local, fz_local ) ;
-
-    // copy nodal force contributions to global force arrray.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      fx[gnode] += fx_local[lnode];
-      fy[gnode] += fy_local[lnode];
-      fz[gnode] += fz_local[lnode];
-    }
-   }
-  ) ;
-}
-
-RAJA_STORAGE
-void CollectDomainNodesToElemNodes(Real_p x, Real_p y, Real_p z,
-                                   Index_p elemToNode,
-                                   Real_p elemX,
-                                   Real_p elemY,
-                                   Real_p elemZ
-                                  )
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = x[nd0i];
-   elemX[1] = x[nd1i];
-   elemX[2] = x[nd2i];
-   elemX[3] = x[nd3i];
-   elemX[4] = x[nd4i];
-   elemX[5] = x[nd5i];
-   elemX[6] = x[nd6i];
-   elemX[7] = x[nd7i];
-
-   elemY[0] = y[nd0i];
-   elemY[1] = y[nd1i];
-   elemY[2] = y[nd2i];
-   elemY[3] = y[nd3i];
-   elemY[4] = y[nd4i];
-   elemY[5] = y[nd5i];
-   elemY[6] = y[nd6i];
-   elemY[7] = y[nd7i];
-
-   elemZ[0] = z[nd0i];
-   elemZ[1] = z[nd1i];
-   elemZ[2] = z[nd2i];
-   elemZ[3] = z[nd3i];
-   elemZ[4] = z[nd4i];
-   elemZ[5] = z[nd5i];
-   elemZ[6] = z[nd6i];
-   elemZ[7] = z[nd7i];
-
-}
-
-RAJA_STORAGE
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-RAJA_STORAGE
-void CalcElemVolumeDerivative(
-                              Real_p dvdx,
-                              Real_p dvdy,
-                              Real_p dvdz,
-                              const_Real_p x,
-                              const_Real_p y,
-                              const_Real_p z
-                             )
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-RAJA_STORAGE
-void CalcElemFBHourglassForce(
-                              Real_p xd, Real_p yd, Real_p zd,
-                              Real_p hourgam0, Real_p hourgam1,
-                              Real_p hourgam2, Real_p hourgam3,
-                              Real_p hourgam4, Real_p hourgam5,
-                              Real_p hourgam6, Real_p hourgam7,
-                              Real_t coefficient,
-                              Real_p hgfx, Real_p hgfy, Real_p hgfz
-                             )
-{
-   const Index_t i00=0;
-   const Index_t i01=1;
-   const Index_t i02=2;
-   const Index_t i03=3;
-
-   Real_t h00 =
-      hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] +
-      hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] +
-      hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] +
-      hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7];
-
-   Real_t h01 =
-      hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] +
-      hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] +
-      hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] +
-      hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7];
-
-   Real_t h02 =
-      hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+
-      hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+
-      hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+
-      hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7];
-
-   Real_t h03 =
-      hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] +
-      hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] +
-      hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] +
-      hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7];
-
-   hgfx[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfx[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfx[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfx[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfx[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfx[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfx[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfx[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] +
-      hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] +
-      hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] +
-      hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7];
-
-   h01 =
-      hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] +
-      hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] +
-      hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] +
-      hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7];
-
-   h02 =
-      hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+
-      hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+
-      hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+
-      hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7];
-
-   h03 =
-      hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] +
-      hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] +
-      hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] +
-      hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7];
-
-
-   hgfy[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfy[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfy[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfy[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfy[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfy[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfy[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfy[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] +
-      hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] +
-      hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] +
-      hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7];
-
-   h01 =
-      hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] +
-      hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] +
-      hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] +
-      hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7];
-
-   h02 =
-      hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+
-      hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+
-      hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+
-      hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7];
-
-   h03 =
-      hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] +
-      hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] +
-      hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] +
-      hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7];
-
-
-   hgfz[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfz[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfz[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfz[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfz[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfz[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfz[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfz[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-}
-
-const Real_t ggamma[4][8] =
-{
-   { Real_t( 1.), Real_t( 1.), Real_t(-1.), Real_t(-1.),
-     Real_t(-1.), Real_t(-1.), Real_t( 1.), Real_t( 1.) },
-
-   { Real_t( 1.), Real_t(-1.), Real_t(-1.), Real_t( 1.),
-     Real_t(-1.), Real_t( 1.), Real_t( 1.), Real_t(-1.) },
-
-   { Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.),
-     Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.) },
-
-   { Real_t(-1.), Real_t( 1.), Real_t(-1.), Real_t( 1.),
-     Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.) }
-
-} ;
-
-
-RAJA_STORAGE
-void CalcFBHourglassForceForElems( Index_p nodelist,
-                                   Real_p  ss, Real_p  elemMass,
-                                   Real_p  xd, Real_p  yd, Real_p  zd,
-                                   Real_p  fx, Real_p  fy, Real_p  fz,
-                                   Real_p  determ,
-                                   Real_p  x8n, Real_p  y8n, Real_p  z8n,
-                                   Real_p  dvdx, Real_p  dvdy, Real_p  dvdz,
-                                   Real_t hourg, RAJA::IndexSet *domElemList)
-{
-   /*************************************************
-    *
-    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-    *               force.
-    *
-    *************************************************/
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-   RAJA::forall<elem_exec_policy>(*domElemList, [=] (int i2) {
-      Real_t hgfx[8], hgfy[8], hgfz[8] ;
-
-      Real_t coefficient;
-
-      Real_t hourgam0[4], hourgam1[4], hourgam2[4], hourgam3[4] ;
-      Real_t hourgam4[4], hourgam5[4], hourgam6[4], hourgam7[4];
-      Real_t xd1[8], yd1[8], zd1[8] ;
-
-      Index_p elemToNode = &nodelist[8*i2];
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * ggamma[i1][0] + x8n[i3+1] * ggamma[i1][1] +
-            x8n[i3+2] * ggamma[i1][2] + x8n[i3+3] * ggamma[i1][3] +
-            x8n[i3+4] * ggamma[i1][4] + x8n[i3+5] * ggamma[i1][5] +
-            x8n[i3+6] * ggamma[i1][6] + x8n[i3+7] * ggamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * ggamma[i1][0] + y8n[i3+1] * ggamma[i1][1] +
-            y8n[i3+2] * ggamma[i1][2] + y8n[i3+3] * ggamma[i1][3] +
-            y8n[i3+4] * ggamma[i1][4] + y8n[i3+5] * ggamma[i1][5] +
-            y8n[i3+6] * ggamma[i1][6] + y8n[i3+7] * ggamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * ggamma[i1][0] + z8n[i3+1] * ggamma[i1][1] +
-            z8n[i3+2] * ggamma[i1][2] + z8n[i3+3] * ggamma[i1][3] +
-            z8n[i3+4] * ggamma[i1][4] + z8n[i3+5] * ggamma[i1][5] +
-            z8n[i3+6] * ggamma[i1][6] + z8n[i3+7] * ggamma[i1][7];
-
-         hourgam0[i1] = ggamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam1[i1] = ggamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam2[i1] = ggamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam3[i1] = ggamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam4[i1] = ggamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam5[i1] = ggamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam6[i1] = ggamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam7[i1] = ggamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=ss[i2];
-      mass1=elemMass[i2];
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = xd[n0si2];
-      xd1[1] = xd[n1si2];
-      xd1[2] = xd[n2si2];
-      xd1[3] = xd[n3si2];
-      xd1[4] = xd[n4si2];
-      xd1[5] = xd[n5si2];
-      xd1[6] = xd[n6si2];
-      xd1[7] = xd[n7si2];
-
-      yd1[0] = yd[n0si2];
-      yd1[1] = yd[n1si2];
-      yd1[2] = yd[n2si2];
-      yd1[3] = yd[n3si2];
-      yd1[4] = yd[n4si2];
-      yd1[5] = yd[n5si2];
-      yd1[6] = yd[n6si2];
-      yd1[7] = yd[n7si2];
-
-      zd1[0] = zd[n0si2];
-      zd1[1] = zd[n1si2];
-      zd1[2] = zd[n2si2];
-      zd1[3] = zd[n3si2];
-      zd1[4] = zd[n4si2];
-      zd1[5] = zd[n5si2];
-      zd1[6] = zd[n6si2];
-      zd1[7] = zd[n7si2];
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1,
-                      hourgam0,hourgam1,hourgam2,hourgam3,
-                      hourgam4,hourgam5,hourgam6,hourgam7,
-                      coefficient, hgfx, hgfy, hgfz);
-
-      fx[n0si2] += hgfx[0];
-      fy[n0si2] += hgfy[0];
-      fz[n0si2] += hgfz[0];
-
-      fx[n1si2] += hgfx[1];
-      fy[n1si2] += hgfy[1];
-      fz[n1si2] += hgfz[1];
-
-      fx[n2si2] += hgfx[2];
-      fy[n2si2] += hgfy[2];
-      fz[n2si2] += hgfz[2];
-
-      fx[n3si2] += hgfx[3];
-      fy[n3si2] += hgfy[3];
-      fz[n3si2] += hgfz[3];
-
-      fx[n4si2] += hgfx[4];
-      fy[n4si2] += hgfy[4];
-      fz[n4si2] += hgfz[4];
-
-      fx[n5si2] += hgfx[5];
-      fy[n5si2] += hgfy[5];
-      fz[n5si2] += hgfz[5];
-
-      fx[n6si2] += hgfx[6];
-      fy[n6si2] += hgfy[6];
-      fz[n6si2] += hgfz[6];
-
-      fx[n7si2] += hgfx[7];
-      fy[n7si2] += hgfy[7];
-      fz[n7si2] += hgfz[7];
-    }
-   ) ; 
-
-}
-
-RAJA_STORAGE
-void CalcHourglassControlForElems(Domain *domain,
-                                  Real_p determ,
-                                  Real_t hgcoef)
-{
-   Index_t numElem = domain->numElem ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_p dvdx = Allocate<Real_t>(numElem8) ;
-   Real_p dvdy = Allocate<Real_t>(numElem8) ;
-   Real_p dvdz = Allocate<Real_t>(numElem8) ;
-   Real_p x8n  = Allocate<Real_t>(numElem8) ;
-   Real_p y8n  = Allocate<Real_t>(numElem8) ;
-   Real_p z8n  = Allocate<Real_t>(numElem8) ;
-
-   /* start loop over elements */
-   RAJA::forall<elem_exec_policy>(*domain->domElemList, [=] (int idx) {
-      Real_t  x1[8],  y1[8],  z1[8] ;
-      Real_t pfx[8], pfy[8], pfz[8] ;
-
-      Index_p elemToNode = &domain->nodelist[8*idx];
-      CollectDomainNodesToElemNodes(domain->x, domain->y, domain->z,
-                                    elemToNode, x1, y1, z1);
-
-      CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
-
-      /* load into temporary storage for FB Hour Glass control */
-      for(int ii=0;ii<8;++ii){
-         int jj=8*idx+ii;
-
-         dvdx[jj] = pfx[ii];
-         dvdy[jj] = pfy[ii];
-         dvdz[jj] = pfz[ii];
-         x8n[jj]  = x1[ii];
-         y8n[jj]  = y1[ii];
-         z8n[jj]  = z1[ii];
-      }
-
-      determ[idx] = domain->volo[idx] * domain->v[idx];
-
-      /* Do a check for negative volumes */
-      if ( domain->v[idx] <= Real_t(0.0) ) {
-         exit(VolumeError) ;
-      }
-    }
-   ) ;
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems( domain->nodelist,
-                                    domain->ss, domain->elemMass,
-                                    domain->xd, domain->yd, domain->zd,
-                                    domain->fx, domain->fy, domain->fz,
-                                    determ, x8n, y8n, z8n, dvdx, dvdy, dvdz,
-                                    hgcoef, domain->domElemList) ;
-   }
-
-   Release(&z8n) ;
-   Release(&y8n) ;
-   Release(&x8n) ;
-   Release(&dvdz) ;
-   Release(&dvdy) ;
-   Release(&dvdx) ;
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcVolumeForceForElems(Domain *domain)
-{
-   Index_t numElem = domain->numElem ;
-   if (numElem != 0) {
-      Real_t  hgcoef = domain->hgcoef ;
-      Real_p sigxx  = Allocate<Real_t>(numElem) ;
-      Real_p sigyy  = Allocate<Real_t>(numElem) ;
-      Real_p sigzz  = Allocate<Real_t>(numElem) ;
-      Real_p determ = Allocate<Real_t>(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(domain->p, domain->q,
-                              sigxx, sigyy, sigzz, domain->domElemList);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( domain->nodelist,
-                               domain->x, domain->y, domain->z,
-                               domain->fx, domain->fy, domain->fz,
-                               sigxx, sigyy, sigzz, determ, domain->domElemList) ;
-
-      // check for negative element volume
-      RAJA::forall<elem_exec_policy>(*domain->domElemList, [=] (int k) {
-         if (determ[k] <= Real_t(0.0)) {
-            exit(VolumeError) ;
-         }
-       }
-      ) ;
-
-      CalcHourglassControlForElems(domain, determ, hgcoef) ;
-
-      Release(&determ) ;
-      Release(&sigzz) ;
-      Release(&sigyy) ;
-      Release(&sigxx) ;
-   }
-}
-
-RAJA_STORAGE
-void CalcForceForNodes(Domain *domain)
-{
-  RAJA::forall<node_exec_policy>(*domain->domNodeList, [=] (int i) {
-     domain->fx[i] = Real_t(0.0) ;
-     domain->fy[i] = Real_t(0.0) ;
-     domain->fz[i] = Real_t(0.0) ;
-   }
-  ) ;
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems(domain) ;
-
-  /* Calculate Nodal Forces at domain boundaries */
-  /* problem->commSBN->Transfer(CommSBN::forces); */
-
-}
-
-RAJA_STORAGE
-void CalcAccelerationForNodes(Real_p xdd, Real_p ydd, Real_p zdd,
-                              Real_p fx, Real_p fy, Real_p fz,
-                              Real_p nodalMass, RAJA::IndexSet *domNodeList)
-{
-   RAJA::forall<node_exec_policy>(*domNodeList, [=] (int i) {
-      xdd[i] = fx[i] / nodalMass[i];
-      ydd[i] = fy[i] / nodalMass[i];
-      zdd[i] = fz[i] / nodalMass[i];
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void ApplyAccelerationBoundaryConditionsForNodes(Real_p xdd, Real_p ydd,
-                                                 Real_p zdd, Index_p symmX,
-                                                 Index_p symmY,
-                                                 Index_p symmZ, Index_t size)
-{
-  Index_t numNodeBC = (size+1)*(size+1) ;
-
-  RAJA::forall<range_exec_policy>(int(0), int(numNodeBC), [=] (int i) {
-     xdd[symmX[i]] = Real_t(0.0) ;
-     ydd[symmY[i]] = Real_t(0.0) ;
-     zdd[symmZ[i]] = Real_t(0.0) ;
-   }
-  ) ;
-}
-
-RAJA_STORAGE
-void CalcVelocityForNodes(Real_p xd,  Real_p yd,  Real_p zd,
-                          Real_p xdd, Real_p ydd, Real_p zdd,
-                          const Real_t dt, const Real_t u_cut,
-                          RAJA::IndexSet *domNodeList)
-{
-   RAJA::forall<node_exec_policy>( *domNodeList, [=] (int i) {
-     Real_t xdtmp, ydtmp, zdtmp ;
-
-     xdtmp = xd[i] + xdd[i] * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     xd[i] = xdtmp ;
-
-     ydtmp = yd[i] + ydd[i] * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     yd[i] = ydtmp ;
-
-     zdtmp = zd[i] + zdd[i] * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     zd[i] = zdtmp ;
-    }
-   ) ;
-
-}
-
-RAJA_STORAGE
-void CalcPositionForNodes(Real_p x,  Real_p y,  Real_p z,
-                          Real_p xd, Real_p yd, Real_p zd,
-                          const Real_t dt, RAJA::IndexSet *domNodeList)
-{
-   RAJA::forall<node_exec_policy>( *domNodeList, [=] (int i) {
-     x[i] += xd[i] * dt ;
-     y[i] += yd[i] * dt ;
-     z[i] += zd[i] * dt ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void LagrangeNodal(Domain *domain)
-{
-  const Real_t delt = domain->deltatime ;
-  Real_t u_cut = domain->u_cut ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes(domain);
-
-  CalcAccelerationForNodes(domain->xdd, domain->ydd, domain->zdd,
-                           domain->fx, domain->fy, domain->fz,
-                           domain->nodalMass, domain->domNodeList);
-
-  ApplyAccelerationBoundaryConditionsForNodes(domain->xdd, domain->ydd,
-                                              domain->zdd, domain->symmX,
-                                              domain->symmY, domain->symmZ,
-                                              domain->sizeX);
-
-  CalcVelocityForNodes( domain->xd,  domain->yd,  domain->zd,
-                        domain->xdd, domain->ydd, domain->zdd,
-                        delt, u_cut, domain->domNodeList) ;
-
-  CalcPositionForNodes( domain->x,  domain->y,  domain->z,
-                        domain->xd, domain->yd, domain->zd,
-                        delt, domain->domNodeList );
-
-  return;
-}
-
-RAJA_STORAGE
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-RAJA_STORAGE
-Real_t CalcElemVolume(
-                       const_Real_p x, const_Real_p y, const_Real_p z
-                     )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-RAJA_STORAGE
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-RAJA_STORAGE
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = MAX(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-RAJA_STORAGE
-void CalcElemVelocityGrandient( const Real_t* const xvel,
-                                const Real_t* const yvel,
-                                const Real_t* const zvel,
-                                const Real_t b[][8],
-                                const Real_t detJ,
-                                Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-RAJA_STORAGE
-void CalcKinematicsForElems( Index_p nodelist,
-                             Real_p x,   Real_p y,   Real_p z,
-                             Real_p xd,  Real_p yd,  Real_p zd,
-                             Real_p dxx, Real_p dyy, Real_p dzz,
-                             Real_p v, Real_p volo,
-                             Real_p vnew, Real_p delv, Real_p arealg,
-                             Real_t deltaTime, RAJA::IndexSet *domElemList )
-{
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(*domElemList, [=] (int k) {
-    Real_t B[3][8] ; /** shape function derivatives */
-    Real_t D[6] ;
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-    Real_t xd_local[8] ;
-    Real_t yd_local[8] ;
-    Real_t zd_local[8] ;
-    Real_t detJ = Real_t(0.0) ;
-
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_p elemToNode = &nodelist[8*k] ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      x_local[lnode] = x[gnode];
-      y_local[lnode] = y[gnode];
-      z_local[lnode] = z[gnode];
-    }
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / volo[k] ;
-    vnew[k] = relativeVolume ;
-    delv[k] = relativeVolume - v[k] ;
-
-    // set characteristic length
-    arealg[k] = CalcElemCharacteristicLength(x_local, y_local, z_local,
-                                             volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = xd[gnode];
-      yd_local[lnode] = yd[gnode];
-      zd_local[lnode] = zd[gnode];
-    }
-
-    Real_t dt2 = Real_t(0.5) * deltaTime;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local, y_local, z_local,
-                                      B, &detJ );
-
-    CalcElemVelocityGrandient( xd_local, yd_local, zd_local,
-                               B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    dxx[k] = D[0];
-    dyy[k] = D[1];
-    dzz[k] = D[2];
-   }
-  ) ;
-}
-
-RAJA_STORAGE
-void CalcLagrangeElements(Domain *domain)
-{
-   Index_t numElem = domain->numElem ;
-   if (numElem > 0) {
-      const Real_t deltatime = domain->deltatime ;
-
-      domain->dxx  = Allocate<Real_t>(numElem) ; /* principal strains */
-      domain->dyy  = Allocate<Real_t>(numElem) ;
-      domain->dzz  = Allocate<Real_t>(numElem) ;
-
-      CalcKinematicsForElems(domain->nodelist,
-                             domain->x, domain->y, domain->z,
-                             domain->xd, domain->yd, domain->zd,
-                             domain->dxx, domain->dyy, domain->dzz,
-                             domain->v, domain->volo,
-                             domain->vnew, domain->delv, domain->arealg,
-                             deltatime, domain->domElemList) ;
-
-      // element loop to do some stuff not included in the elemlib function.
-      RAJA::forall<elem_exec_policy>( *domain->domElemList, [=] (int k) {
-        // calc strain rate and apply as constraint (only done in FB element)
-        Real_t vdov = domain->dxx[k] + domain->dyy[k] + domain->dzz[k] ;
-        Real_t vdovthird = vdov/Real_t(3.0) ;
-        
-        // make the rate of deformation tensor deviatoric
-        domain->vdov[k] = vdov ;
-        domain->dxx[k] -= vdovthird ;
-        domain->dyy[k] -= vdovthird ;
-        domain->dzz[k] -= vdovthird ;
-
-        // See if any volumes are negative, and take appropriate action.
-        if (domain->vnew[k] <= Real_t(0.0))
-        {
-           exit(VolumeError) ;
-        }
-       }
-      ) ;
-
-      Release(&domain->dzz) ;
-      Release(&domain->dyy) ;
-      Release(&domain->dxx) ;
-   }
-}
-
-RAJA_STORAGE
-void CalcMonotonicQGradientsForElems(Real_p x,  Real_p y,  Real_p z,
-                                     Real_p xd, Real_p yd, Real_p zd,
-                                     Real_p volo, Real_p vnew,
-                                     Real_p delv_xi,
-                                     Real_p delv_eta,
-                                     Real_p delv_zeta,
-                                     Real_p delx_xi,
-                                     Real_p delx_eta,
-                                     Real_p delx_zeta,
-                                     Index_p nodelist,
-                                     RAJA::IndexSet *domElemList)
-{
-#define SUM4(a,b,c,d) (a + b + c + d)
-
-   RAJA::forall<elem_exec_policy>(*domElemList, [=] (int i) {
-      const Real_t ptiny = Real_t(1.e-36) ;
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      Index_p elemToNode = &nodelist[8*i];
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = x[n0] ;
-      Real_t x1 = x[n1] ;
-      Real_t x2 = x[n2] ;
-      Real_t x3 = x[n3] ;
-      Real_t x4 = x[n4] ;
-      Real_t x5 = x[n5] ;
-      Real_t x6 = x[n6] ;
-      Real_t x7 = x[n7] ;
-
-      Real_t y0 = y[n0] ;
-      Real_t y1 = y[n1] ;
-      Real_t y2 = y[n2] ;
-      Real_t y3 = y[n3] ;
-      Real_t y4 = y[n4] ;
-      Real_t y5 = y[n5] ;
-      Real_t y6 = y[n6] ;
-      Real_t y7 = y[n7] ;
-
-      Real_t z0 = z[n0] ;
-      Real_t z1 = z[n1] ;
-      Real_t z2 = z[n2] ;
-      Real_t z3 = z[n3] ;
-      Real_t z4 = z[n4] ;
-      Real_t z5 = z[n5] ;
-      Real_t z6 = z[n6] ;
-      Real_t z7 = z[n7] ;
-
-      Real_t xv0 = xd[n0] ;
-      Real_t xv1 = xd[n1] ;
-      Real_t xv2 = xd[n2] ;
-      Real_t xv3 = xd[n3] ;
-      Real_t xv4 = xd[n4] ;
-      Real_t xv5 = xd[n5] ;
-      Real_t xv6 = xd[n6] ;
-      Real_t xv7 = xd[n7] ;
-
-      Real_t yv0 = yd[n0] ;
-      Real_t yv1 = yd[n1] ;
-      Real_t yv2 = yd[n2] ;
-      Real_t yv3 = yd[n3] ;
-      Real_t yv4 = yd[n4] ;
-      Real_t yv5 = yd[n5] ;
-      Real_t yv6 = yd[n6] ;
-      Real_t yv7 = yd[n7] ;
-
-      Real_t zv0 = zd[n0] ;
-      Real_t zv1 = zd[n1] ;
-      Real_t zv2 = zd[n2] ;
-      Real_t zv3 = zd[n3] ;
-      Real_t zv4 = zd[n4] ;
-      Real_t zv5 = zd[n5] ;
-      Real_t zv6 = zd[n6] ;
-      Real_t zv7 = zd[n7] ;
-
-      Real_t vol = volo[i]*vnew[i] ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;
-      Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;
-      Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;
-      Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;
-      Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;
-      Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;
-      Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      delx_zeta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;
-      dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;
-      dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;
-
-      delv_zeta[i] = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      delx_xi[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;
-      dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;
-      dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;
-
-      delv_xi[i] = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      delx_eta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;
-      dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;
-      dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;
-
-      delv_eta[i] = ax*dxv + ay*dyv + az*dzv ;
-    }
-   ) ;
-
-#undef SUM4
-}
-
-RAJA_STORAGE
-void CalcMonotonicQRegionForElems(
-                           RAJA::IndexSet *matElemList, Index_p elemBC,
-                           Index_p lxim,   Index_p lxip,
-                           Index_p letam,  Index_p letap,
-                           Index_p lzetam, Index_p lzetap,
-                           Real_p delv_xi,Real_p delv_eta,Real_p delv_zeta,
-                           Real_p delx_xi,Real_p delx_eta,Real_p delx_zeta,
-                           Real_p vdov, Real_p volo, Real_p vnew,
-                           Real_p elemMass, Real_p qq, Real_p ql,
-                           Real_t qlc_monoq, Real_t qqc_monoq,
-                           Real_t monoq_limiter_mult,
-                           Real_t monoq_max_slope,
-                           Real_t ptiny )
-{
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] (int i) {
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Int_t bcMask = elemBC[i] ;
-      Real_t delvm, delvp ;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / ( delv_xi[i] + ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case 0:         delvm = delv_xi[lxim[i]] ; break ;
-         case XI_M_SYMM: delvm = delv_xi[i] ;       break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
-         default:        /* ERROR */ ;              break ;
-      }
-      switch (bcMask & XI_P) {
-         case 0:         delvp = delv_xi[lxip[i]] ; break ;
-         case XI_P_SYMM: delvp = delv_xi[i] ;       break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
-         default:        /* ERROR */ ;              break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( delv_eta[i] + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case 0:          delvm = delv_eta[letam[i]] ; break ;
-         case ETA_M_SYMM: delvm = delv_eta[i] ;        break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
-         default:         /* ERROR */ ;                break ;
-      }
-      switch (bcMask & ETA_P) {
-         case 0:          delvp = delv_eta[letap[i]] ; break ;
-         case ETA_P_SYMM: delvp = delv_eta[i] ;        break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
-         default:         /* ERROR */ ;                break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( delv_zeta[i] + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case 0:           delvm = delv_zeta[lzetam[i]] ; break ;
-         case ZETA_M_SYMM: delvm = delv_zeta[i] ;         break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
-         default:          /* ERROR */ ;                  break ;
-      }
-      switch (bcMask & ZETA_P) {
-         case 0:           delvp = delv_zeta[lzetap[i]] ; break ;
-         case ZETA_P_SYMM: delvp = delv_zeta[i] ;         break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
-         default:          /* ERROR */ ;                  break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( vdov[i] > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = delv_xi[i]   * delx_xi[i]   ;
-         Real_t delvxeta  = delv_eta[i]  * delx_eta[i]  ;
-         Real_t delvxzeta = delv_zeta[i] * delx_zeta[i] ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = elemMass[i] / (volo[i] * vnew[i]) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      qq[i] = qquad ;
-      ql[i] = qlin  ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void CalcMonotonicQForElems(Domain *domain)
-{  
-   //
-   // calculate the monotonic q for pure regions
-   //
-   Index_t numElem = domain->numElem ;
-   if (numElem > 0) {
-      //
-      // initialize parameters
-      // 
-      const Real_t ptiny = Real_t(1.e-36) ;
-
-      CalcMonotonicQRegionForElems(
-                           domain->matElemList, domain->elemBC,
-                           domain->lxim,   domain->lxip,
-                           domain->letam,  domain->letap,
-                           domain->lzetam, domain->lzetap,
-                           domain->delv_xi,domain->delv_eta,domain->delv_zeta,
-                           domain->delx_xi,domain->delx_eta,domain->delx_zeta,
-                           domain->vdov, domain->volo, domain->vnew,
-                           domain->elemMass, domain->qq, domain->ql,
-                           domain->qlc_monoq, domain->qqc_monoq,
-                           domain->monoq_limiter_mult,
-                           domain->monoq_max_slope,
-                           ptiny );
-   }
-}
-
-RAJA_STORAGE
-void CalcQForElems(Domain *domain)
-{
-   //
-   // MONOTONIC Q option
-   //
-
-   Index_t numElem = domain->numElem ;
-
-   if (numElem != 0) {
-      /* allocate domain length arrays */
-
-      domain->delv_xi = Allocate<Real_t>(numElem) ;   /* velocity gradient */
-      domain->delv_eta = Allocate<Real_t>(numElem) ;
-      domain->delv_zeta = Allocate<Real_t>(numElem) ;
-
-      domain->delx_xi = Allocate<Real_t>(numElem) ;   /* position gradient */
-      domain->delx_eta = Allocate<Real_t>(numElem) ;
-      domain->delx_zeta = Allocate<Real_t>(numElem) ;
-
-      /* Calculate velocity gradients, applied at the domain level */
-      CalcMonotonicQGradientsForElems(domain->x,  domain->y,  domain->z,
-                                      domain->xd, domain->yd, domain->zd,
-                                      domain->volo, domain->vnew,
-                                      domain->delv_xi,
-                                      domain->delv_eta,
-                                      domain->delv_zeta,
-                                      domain->delx_xi,
-                                      domain->delx_eta,
-                                      domain->delx_zeta,
-                                      domain->nodelist,
-                                      domain->domElemList) ;
-
-      /* Transfer veloctiy gradients in the first order elements */
-      /* problem->commElements->Transfer(CommElements::monoQ) ; */
-
-      /* This will be applied at the region level */
-      CalcMonotonicQForElems(domain) ;
-
-      /* release domain length arrays */
-
-      Release(&domain->delx_zeta) ;
-      Release(&domain->delx_eta) ;
-      Release(&domain->delx_xi) ;
-
-      Release(&domain->delv_zeta) ;
-      Release(&domain->delv_eta) ;
-      Release(&domain->delv_xi) ;
-
-      /* Don't allow excessive artificial viscosity */
-      Real_t qstop = domain->qstop ;
-      int id = -1; 
-
-      // RAJA::forall<elem_exec_policy>( *domain->domElemList, [=] (int i) {
-      //    if ( domain->q[i] > qstop ) {
-      //       id = i ;
-      //       // break ;
-      //    }
-      //  }
-      // ) ;
-
-      if(id >= 0) {
-         exit(QStopError) ;
-      }
-   }
-}
-
-RAJA_STORAGE
-void CalcPressureForElems(Real_p p_new, Real_p bvc,
-                          Real_p pbvc, Real_p e_old,
-                          Real_p compression, Real_p vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          RAJA::IndexSet *matElemList)
-{
-   const Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] (int i) {
-      bvc[i] = c1s * (compression[i] + Real_t(1.));
-      pbvc[i] = c1s;
-    }
-   ) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] (int i) {
-      p_new[i] = bvc[i] * e_old[i] ;
-
-      if    (FABS(p_new[i]) <  p_cut   )
-         p_new[i] = Real_t(0.0) ;
-
-      if    ( vnewc[i] >= eosvmax ) /* impossible condition here? */
-         p_new[i] = Real_t(0.0) ;
-
-      if    (p_new[i]       <  pmin)
-         p_new[i]   = pmin ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void CalcEnergyForElems(Real_p p_new, Real_p e_new, Real_p q_new,
-                        Real_p bvc, Real_p pbvc,
-                        Real_p p_old, Real_p e_old, Real_p q_old,
-                        Real_p compression, Real_p compHalfStep,
-                        Real_p vnewc, Real_p work, Real_p delvc, Real_t pmin,
-                        Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
-                        Real_p qq_old, Real_p ql_old,
-                        Real_t rho0,
-                        Real_t eosvmax,
-                        RAJA::IndexSet *matElemList,
-                        Index_t length)
-{
-   const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-   Real_p pHalfStep = Allocate<Real_t>(length) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] (int i) {
-      e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
-         + Real_t(0.5) * work[i];
-
-      if (e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-    }
-   ) ;
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                   pmin, p_cut, eosvmax, matElemList);
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] (int i) {
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
-
-      if ( delvc[i] > Real_t(0.) ) {
-         q_new[i] /* = qq_old[i] = ql_old[i] */ = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new[i] + Real_t(0.5) * (delvc[i]
-         * (  Real_t(3.0)*(p_old[i]     + q_old[i])
-              - Real_t(4.0)*(pHalfStep[i] + q_new[i])) + work[i] ) ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-    }
-   ) ;
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, matElemList);
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] (int i) {
-      Real_t q_tilde ;
-
-      if (delvc[i] > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
-                               - Real_t(8.0)*(pHalfStep[i] + q_new[i])
-                               + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-    }
-   ) ;
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, matElemList);
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] (int i) {
-
-      if ( delvc[i] <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-
-         if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
-      }
-    }
-   ) ;
-
-   Release(&pHalfStep) ;
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcSoundSpeedForElems(RAJA::IndexSet *matElemList, Real_p ss,
-                            Real_p vnewc, Real_t rho0, Real_p enewc,
-                            Real_p pnewc, Real_p pbvc,
-                            Real_p bvc, Real_t ss4o3)
-{
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] (int iz) {
-      Real_t ssTmp = (pbvc[iz] * enewc[iz] + vnewc[iz] * vnewc[iz] *
-                 bvc[iz] * pnewc[iz]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp);
-      }
-      ss[iz] = ssTmp ;
-    }
-   ) ;
-}
-
-RAJA_STORAGE
-void EvalEOSForElems(Domain *domain, Real_p vnewc, Index_t numElem)
-{
-   Real_t  e_cut = domain->e_cut ;
-   Real_t  p_cut = domain->p_cut ;
-   Real_t  ss4o3 = domain->ss4o3 ;
-   Real_t  q_cut = domain->q_cut ;
-
-   Real_t eosvmax = domain->eosvmax ;
-   Real_t eosvmin = domain->eosvmin ;
-   Real_t pmin    = domain->pmin ;
-   Real_t emin    = domain->emin ;
-   Real_t rho0    = domain->refdens ;
-
-   /* allocate *domain length* arrays.  */
-   /* wastes memory, but allows us to get */
-   /* around a "temporary workset" issue */
-   /* we have not yet addressed. */
-   Real_p delvc = domain->delv ;
-   Real_p p_old = Allocate<Real_t>(numElem) ;
-   Real_p compression = Allocate<Real_t>(numElem) ;
-   Real_p compHalfStep = Allocate<Real_t>(numElem) ;
-   Real_p work = Allocate<Real_t>(numElem) ;
-   Real_p p_new = Allocate<Real_t>(numElem) ;
-   Real_p e_new = Allocate<Real_t>(numElem) ;
-   Real_p q_new = Allocate<Real_t>(numElem) ;
-   Real_p bvc = Allocate<Real_t>(numElem) ;
-   Real_p pbvc = Allocate<Real_t>(numElem) ;
-
-   /* compress data, minimal set */
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] (int zidx) {
-      p_old[zidx] = domain->p[zidx] ;
-    }
-   ) ;
-
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] (int zidx) {
-      Real_t vchalf ;
-      compression[zidx] = Real_t(1.) / vnewc[zidx] - Real_t(1.);
-      vchalf = vnewc[zidx] - delvc[zidx] * Real_t(.5);
-      compHalfStep[zidx] = Real_t(1.) / vchalf - Real_t(1.);
-    }
-   ) ;
-
-   /* Check for v > eosvmax or v < eosvmin */
-   if ( eosvmin != Real_t(0.) ) {
-      RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] (int zidx) {
-         if (vnewc[zidx] <= eosvmin) { /* impossible due to calling func? */
-            compHalfStep[zidx] = compression[zidx] ;
-         }
-       }
-      ) ;
-   }
-   if ( eosvmax != Real_t(0.) ) {
-      RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] (int zidx) {
-         if (vnewc[zidx] >= eosvmax) { /* impossible due to calling func? */
-            p_old[zidx]        = Real_t(0.) ;
-            compression[zidx]  = Real_t(0.) ;
-            compHalfStep[zidx] = Real_t(0.) ;
-         }
-       }
-      ) ;
-   }
-
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] (int zidx) {
-      work[zidx] = Real_t(0.) ; 
-    }
-   ) ;
-
-   CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
-                 p_old, domain->e,  domain->q, compression, compHalfStep,
-                 vnewc, work,  delvc, pmin,
-                 p_cut, e_cut, q_cut, emin,
-                 domain->qq, domain->ql, rho0, eosvmax,
-                 domain->matElemList, numElem);
-
-
-   RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] (int zidx) {
-      domain->p[zidx] = p_new[zidx] ;
-      domain->e[zidx] = e_new[zidx] ;
-      domain->q[zidx] = q_new[zidx] ;
-    }
-   ) ;
-
-   CalcSoundSpeedForElems(domain->matElemList, domain->ss,
-             vnewc, rho0, e_new, p_new,
-             pbvc, bvc, ss4o3) ;
-
-   Release(&pbvc) ;
-   Release(&bvc) ;
-   Release(&q_new) ;
-   Release(&e_new) ;
-   Release(&p_new) ;
-   Release(&work) ;
-   Release(&compHalfStep) ;
-   Release(&compression) ;
-   Release(&p_old) ;
-}
-
-RAJA_STORAGE
-void ApplyMaterialPropertiesForElems(Domain *domain)
-{
-  Index_t numElem = domain->numElem ;
-
-  if (numElem != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = domain->eosvmin ;
-    Real_t eosvmax = domain->eosvmax ;
-
-    /* create a domain length (not material length) temporary */
-    /* we are assuming here that the number of dense ranges is */
-    /* much greater than the number of sigletons.  We are also */
-    /* assuming it is ok to allocate a domain length temporary */
-    /* rather than a material length temporary. */
-
-    Real_p vnewc = Allocate<Real_t>(numElem) ;
-
-    RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] (int zn) {
-       vnewc[zn] = domain->vnew[zn] ;
-     }
-    ) ;
-
-    if (eosvmin != Real_t(0.)) {
-       RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] (int zn) {
-          if (vnewc[zn] < eosvmin)
-             vnewc[zn] = eosvmin ;
-        }
-       ) ;
-    }
-
-    if (eosvmax != Real_t(0.)) {
-       RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] (int zn) {
-          if (vnewc[zn] > eosvmax)
-             vnewc[zn] = eosvmax ;
-        }
-       ) ;
-    }
-
-    RAJA::forall<mat_exec_policy>( *domain->matElemList, [=] (int zn) {
-       Real_t vc = domain->v[zn] ;
-       if (eosvmin != Real_t(0.)) {
-          if (vc < eosvmin)
-             vc = eosvmin ;
-       }
-       if (eosvmax != Real_t(0.)) {
-          if (vc > eosvmax)
-             vc = eosvmax ;
-       }
-       if (vc <= 0.) {
-          exit(VolumeError) ;
-       }
-     }
-    ) ;
-
-    EvalEOSForElems(domain, vnewc, numElem);
-
-    Release(&vnewc) ;
-
-  }
-}
-
-RAJA_STORAGE
-void UpdateVolumesForElems(RAJA::IndexSet *domElemList,
-                           Real_p vnew, Real_p v, Real_t v_cut)
-{
-   RAJA::forall<elem_exec_policy>( *domElemList, [=] (int i) {
-      Real_t tmpV = vnew[i] ;
-
-     if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-         tmpV = Real_t(1.0) ;
-
-     v[i] = tmpV ;
-    }
-   ) ;
-
-   return ;
-}
-
-RAJA_STORAGE
-void LagrangeElements(Domain *domain, Index_t numElem)
-{
-  /* new relative volume -- temporary */
-  domain->vnew = Allocate<Real_t>(numElem) ;
-
-  CalcLagrangeElements(domain) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems(domain) ;
-
-  ApplyMaterialPropertiesForElems(domain) ;
-
-  UpdateVolumesForElems(domain->domElemList,
-                        domain->vnew, domain->v, domain->v_cut) ;
-
-  Release(&domain->vnew) ;
-}
-
-RAJA_STORAGE
-void CalcCourantConstraintForElems(RAJA::IndexSet *matElemList, Real_p ss,
-                                   Real_p vdov, Real_p arealg,
-                                   Real_t qqc, Real_t *dtcourant)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dtcourantLoc(Real_t(1.0e+20)) ;
-   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int indx) {
-      Real_t dtf = ss[indx] * ss[indx] ;
-      Real_t dtf_cmp ;
-
-      if ( vdov[indx] < Real_t(0.) ) {
-         dtf += qqc2 * arealg[indx] * arealg[indx] * vdov[indx] * vdov[indx] ;
-      }
-
-      dtf_cmp = (vdov[indx] != Real_t(0.))
-              ?  arealg[indx] / SQRT(dtf) : Real_t(1.0e+20) ;
-
-      /* determine minimum timestep with its corresponding elem */
-      dtcourantLoc.min(dtf_cmp) ;
-   } ) ;
-
-   /* Don't try to register a time constraint if none of the elements
-    * were active */
-   if (dtcourantLoc < Real_t(1.0e+20)) {
-      *dtcourant = dtcourantLoc ;
-   }
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcHydroConstraintForElems(RAJA::IndexSet *matElemList, Real_p vdov,
-                                 Real_t dvovmax, Real_t *dthydro)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dthydroLoc(Real_t(1.0e+20)) ;
-
-   RAJA::forall<mat_exec_policy>( *matElemList, [=] RAJA_DEVICE (int indx) {
-
-      Real_t dtvov_cmp = (vdov[indx] != Real_t(0.))
-                       ? (dvovmax / (FABS(vdov[indx])+Real_t(1.e-20)))
-                       : Real_t(1.0e+10) ;
-
-      dthydroLoc.min(dtvov_cmp) ;
-   } ) ;
-
-   if (dthydroLoc < Real_t(1.0e+20)) {
-      *dthydro = dthydroLoc ;
-   }
-
-   return ;
-}
-
-RAJA_STORAGE
-void CalcTimeConstraintsForElems(Domain *domain) {
-   /* evaluate time constraint */
-   /* normally,  this call is on a per region basis */
-   CalcCourantConstraintForElems(domain->matElemList, domain->ss,
-                                 domain->vdov, domain->arealg,
-                                 domain->qqc, &domain->dtcourant) ;
-
-   /* check hydro constraint */
-   CalcHydroConstraintForElems(domain->matElemList, domain->vdov,
-                               domain->dvovmax, &domain->dthydro) ;
-}
-
-RAJA_STORAGE
-void LagrangeLeapFrog(Domain *domain)
-{
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal(domain);
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements(domain, domain->numElem);
-
-   CalcTimeConstraintsForElems(domain);
-
-}
-
-int main(int argc, char *argv[])
-{
-
-   RAJA::Timer timer_main;
-   RAJA::Timer timer_cycle;
-
-   timer_main.start("timer_main");
-
-   Real_t tx, ty, tz ;
-   Index_t nidx, zidx ;
-   struct Domain domain ;
-   int maxIter = 1024*1024 ;
-
-   Index_t edgeElems = lulesh_edge_elems ;
-
-   for (int i=1; i<argc; ++i) {
-      if (strcmp(argv[i], "-p") == 0) {
-         show_run_progress = true ;
-      }
-      else if (strcmp(argv[i], "-i") == 0) {
-         if ((i+1 < argc) && isdigit(argv[i+1][0])) {
-            maxIter = atoi(argv[i+1]) ;
-            ++i;
-         }
-         else  {
-            printf("Iteration (-i) option has bad argument -- ignoring\n") ;
-         }
-      }
-      else if (strcmp(argv[i], "-s") == 0) {
-         if ((i+1 < argc) && isdigit(argv[i+1][0])) {
-            edgeElems = atoi(argv[i+1]) ;
-            ++i;
-         }
-         else  {
-            printf("Size (-s) option has bad argument -- ignoring\n") ;
-         }
-      }
-   }
-
-   Index_t edgeNodes = edgeElems+1 ;
-
-   /****************************/
-   /*  Print run parameters    */
-   /****************************/
-   printf("LULESH parallel run parameters:\n");
-   printf("\t stop time = %e\n", double(lulesh_stop_time)) ;
-   if ( lulesh_time_step > 0 ) {
-     printf("\t Fixed time step = %e\n", double(lulesh_time_step)) ;
-   } else {
-     printf("\t CFL-controlled: initial time step = %e\n",
-            double(-lulesh_time_step)) ;
-   }
-   printf("\t Mesh size = %i x %i x %i\n",
-          edgeElems, edgeElems, edgeElems) ;
-
-   switch (lulesh_tiling_mode) {
-      case Canonical:
-      {
-         printf("\t Tiling mode is 'Canonical'\n");
-         break;
-      }
-      case Tiled_Index:
-      {
-         printf("\t Tiling mode is 'Tiled_Index'\n");
-         break;
-      }
-      case Tiled_Order:
-      {
-         printf("\t Tiling mode is 'Tiled_Order'\n");
-         break;
-      }
-      case Tiled_LockFree:
-      {
-         printf("\t Tiling mode is 'Lock-free chunk'\n");
-         break;
-      }
-      default :
-      {
-         printf("Unknown tiling mode!!!\n");
-      }
-   }
-
-   if (lulesh_tiling_mode != Canonical) {
-      printf("\t Mesh tiling = %i x %i x %i\n",
-             lulesh_xtile, lulesh_ytile, lulesh_ztile) ;
-   }
-
-   /****************************/
-   /*   Initialize Sedov Mesh  */
-   /****************************/
-
-   /* construct a uniform box for this processor */
-
-   domain.sizeX = edgeElems ;
-   domain.sizeY = edgeElems ;
-   domain.sizeZ = edgeElems ;
-   domain.numElem = edgeElems*edgeElems*edgeElems ;
-
-   domain.numNode = edgeNodes*edgeNodes*edgeNodes ;
-
-   Index_t domElems = domain.numElem ;
-   Index_t domNodes = domain.numNode ;
-
-   /*************************/
-   /* allocate field memory */
-   /*************************/
-   
-   /*****************/
-   /* Elem-centered */
-   /*****************/
-
-   /* elemToNode connectivity */
-   domain.nodelist = Allocate<Index_t>(8*domElems) ;
-
-   /* elem connectivity through face */
-   domain.lxim = Allocate<Index_t>(domElems) ;
-   domain.lxip = Allocate<Index_t>(domElems)  ;
-   domain.letam = Allocate<Index_t>(domElems) ;
-   domain.letap = Allocate<Index_t>(domElems) ;
-   domain.lzetam = Allocate<Index_t>(domElems) ;
-   domain.lzetap = Allocate<Index_t>(domElems) ;
-
-   /* elem face symm/free-surface flag */
-   domain.elemBC = Allocate<Int_t>(domElems) ;
-
-   domain.e = Allocate<Real_t>(domElems) ;   /* energy */
-   domain.p = Allocate<Real_t>(domElems) ;   /* pressure */
-
-   domain.q = Allocate<Real_t>(domElems) ;   /* q */
-   domain.ql = Allocate<Real_t>(domElems) ;  /* linear term for q */
-   domain.qq = Allocate<Real_t>(domElems) ;  /* quadratic term for q */
-
-   domain.v = Allocate<Real_t>(domElems) ;     /* relative volume */
-   domain.volo = Allocate<Real_t>(domElems) ;  /* reference volume */
-   domain.delv = Allocate<Real_t>(domElems) ;  /* m_vnew - m_v */
-   domain.vdov = Allocate<Real_t>(domElems) ;  /* volume deriv over volume */
-
-   /* elem characteristic length */
-   domain.arealg = Allocate<Real_t>(domElems) ;
-
-   domain.ss = Allocate<Real_t>(domElems) ;    /* "sound speed" */
-
-   domain.elemMass = Allocate<Real_t>(domElems) ;  /* mass */
-
-   /*****************/
-   /* Node-centered */
-   /*****************/
-
-   domain.x = Allocate<Real_t>(domNodes) ;  /* coordinates */
-   domain.y = Allocate<Real_t>(domNodes)  ;
-   domain.z = Allocate<Real_t>(domNodes)  ;
-
-   domain.xd = Allocate<Real_t>(domNodes) ; /* velocities */
-   domain.yd = Allocate<Real_t>(domNodes)  ;
-   domain.zd = Allocate<Real_t>(domNodes) ;
-
-   domain.xdd = Allocate<Real_t>(domNodes)  ; /* accelerations */
-   domain.ydd = Allocate<Real_t>(domNodes)  ;
-   domain.zdd = Allocate<Real_t>(domNodes)  ;
-
-   domain.fx = Allocate<Real_t>(domNodes) ;  /* forces */
-   domain.fy = Allocate<Real_t>(domNodes) ;
-   domain.fz = Allocate<Real_t>(domNodes) ;
-
-   domain.nodalMass = Allocate<Real_t>(domNodes) ;  /* mass */
-
-   /* Boundary nodesets */
-
-   domain.symmX = Allocate<Index_t>(edgeNodes*edgeNodes) ;
-   domain.symmY = Allocate<Index_t>(edgeNodes*edgeNodes) ;
-   domain.symmZ = Allocate<Index_t>(edgeNodes*edgeNodes) ;
-
-   /* Basic Field Initialization */
-
-   for (Index_t i=0; i<domElems; ++i) {
-      domain.e[i] = Real_t(0.0) ;
-      domain.p[i] = Real_t(0.0) ;
-      domain.q[i] = Real_t(0.0) ;
-      domain.v[i] = Real_t(1.0) ;
-   }
-
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.xd[i] = Real_t(0.0) ;
-      domain.yd[i] = Real_t(0.0) ;
-      domain.zd[i] = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.xdd[i] = Real_t(0.0) ;
-      domain.ydd[i] = Real_t(0.0) ;
-      domain.zdd[i] = Real_t(0.0) ;
-   }
-
-   /* initialize nodal coordinates */
-
-   nidx = 0 ;
-   tz  = Real_t(0.) ;
-   for (Index_t plane=0; plane<edgeNodes; ++plane) {
-      ty = Real_t(0.) ;
-      for (Index_t row=0; row<edgeNodes; ++row) {
-         tx = Real_t(0.) ;
-         for (Index_t col=0; col<edgeNodes; ++col) {
-            domain.x[nidx] = tx ;
-            domain.y[nidx] = ty ;
-            domain.z[nidx] = tz ;
-            ++nidx ;
-            // tx += ds ; /* may accumulate roundoff... */
-            tx = Real_t(1.125)*Real_t(col+1)/Real_t(edgeElems) ;
-         }
-         // ty += ds ;  /* may accumulate roundoff... */
-         ty = Real_t(1.125)*Real_t(row+1)/Real_t(edgeElems) ;
-      }
-      // tz += ds ;  /* may accumulate roundoff... */
-      tz = Real_t(1.125)*Real_t(plane+1)/Real_t(edgeElems) ;
-   }
-
-
-   /* embed hexehedral elements in nodal point lattice */
-
-   nidx = 0 ;
-   zidx = 0 ;
-   for (Index_t plane=0; plane<edgeElems; ++plane) {
-      for (Index_t row=0; row<edgeElems; ++row) {
-         for (Index_t col=0; col<edgeElems; ++col) {
-            Index_p localNode = &domain.nodelist[8*zidx] ;
-            localNode[0] = nidx                                       ;
-            localNode[1] = nidx                                   + 1 ;
-            localNode[2] = nidx                       + edgeNodes + 1 ;
-            localNode[3] = nidx                       + edgeNodes     ;
-            localNode[4] = nidx + edgeNodes*edgeNodes                 ;
-            localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
-            localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
-            localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
-            ++zidx ;
-            ++nidx ;
-         }
-         ++nidx ;
-      }
-      nidx += edgeNodes ;
-   }
-
-   /* initialize material parameters */
-   domain.dtfixed = Real_t(lulesh_time_step) ;
-   domain.deltatime = Real_t(1.0e-7) ;
-   domain.deltatimemultlb = Real_t(1.1) ;
-   domain.deltatimemultub = Real_t(1.2) ;
-   domain.stoptime  = Real_t(lulesh_stop_time) ;
-   domain.dtcourant = Real_t(1.0e+20) ;
-   domain.dthydro   = Real_t(1.0e+20) ;
-   domain.dtmax     = Real_t(1.0e-2) ;
-   domain.time    = Real_t(0.) ;
-   domain.cycle   = 0 ;
-
-   domain.e_cut = Real_t(1.0e-7) ;
-   domain.p_cut = Real_t(1.0e-7) ;
-   domain.q_cut = Real_t(1.0e-7) ;
-   domain.u_cut = Real_t(1.0e-7) ;
-   domain.v_cut = Real_t(1.0e-10) ;
-
-   domain.hgcoef      = Real_t(3.0) ;
-   domain.ss4o3       = Real_t(4.0)/Real_t(3.0) ;
-
-   domain.qstop              =  Real_t(1.0e+12) ;
-   domain.monoq_max_slope    =  Real_t(1.0) ;
-   domain.monoq_limiter_mult =  Real_t(2.0) ;
-   domain.qlc_monoq          = Real_t(0.5) ;
-   domain.qqc_monoq          = Real_t(2.0)/Real_t(3.0) ;
-   domain.qqc                = Real_t(2.0) ;
-
-   domain.pmin =  Real_t(0.) ;
-   domain.emin = Real_t(-1.0e+15) ;
-
-   domain.dvovmax =  Real_t(0.1) ;
-
-   domain.eosvmax =  Real_t(1.0e+9) ;
-   domain.eosvmin =  Real_t(1.0e-9) ;
-
-   domain.refdens =  Real_t(1.0) ;
-
-   /* initialize field data */
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.nodalMass[i] = 0.0 ;
-   }
-
-   for (Index_t i=0; i<domElems; ++i) {
-      Real_t x_local[8], y_local[8], z_local[8] ;
-      Index_p elemToNode = &domain.nodelist[8*i] ;
-      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-      {
-        Index_t gnode = elemToNode[lnode];
-        x_local[lnode] = domain.x[gnode];
-        y_local[lnode] = domain.y[gnode];
-        z_local[lnode] = domain.z[gnode];
-      }
-
-      // volume calculations
-      Real_t volume = CalcElemVolume(x_local, y_local, z_local );
-      domain.volo[i] = volume ;
-      domain.elemMass[i] = volume ;
-      for (Index_t j=0; j<8; ++j) {
-         Index_t idx = elemToNode[j] ;
-         domain.nodalMass[idx] += volume / Real_t(8.0) ;
-      }
-   }
-
-   /* deposit energy */
-   domain.e[0] = Real_t(3.948746e+7) ;
-
-   /* set up symmetry nodesets */
-   nidx = 0 ;
-   for (Index_t i=0; i<edgeNodes; ++i) {
-      Index_t planeInc = i*edgeNodes*edgeNodes ;
-      Index_t rowInc   = i*edgeNodes ;
-      for (Index_t j=0; j<edgeNodes; ++j) {
-         domain.symmX[nidx] = planeInc + j*edgeNodes ;
-         domain.symmY[nidx] = planeInc + j ;
-         domain.symmZ[nidx] = rowInc   + j ;
-         ++nidx ;
-      }
-   }
-
-   /* set up elemement connectivity information */
-   domain.lxim[0] = 0 ;
-   for (Index_t i=1; i<domElems; ++i) {
-      domain.lxim[i]   = i-1 ;
-      domain.lxip[i-1] = i ;
-   }
-   domain.lxip[domElems-1] = domElems-1 ;
-
-   for (Index_t i=0; i<edgeElems; ++i) {
-      domain.letam[i] = i ; 
-      domain.letap[domElems-edgeElems+i] = domElems-edgeElems+i ;
-   }
-   for (Index_t i=edgeElems; i<domElems; ++i) {
-      domain.letam[i] = i-edgeElems ;
-      domain.letap[i-edgeElems] = i ;
-   }
-
-   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
-      domain.lzetam[i] = i ;
-      domain.lzetap[domElems-edgeElems*edgeElems+i] = domElems-edgeElems*edgeElems+i ;
-   }
-   for (Index_t i=edgeElems*edgeElems; i<domElems; ++i) {
-      domain.lzetam[i] = i - edgeElems*edgeElems ;
-      domain.lzetap[i-edgeElems*edgeElems] = i ;
-   }
-
-   /* set up boundary condition information */
-   for (Index_t i=0; i<domElems; ++i) {
-      domain.elemBC[i] = 0 ;  /* clear BCs by default */
-   }
-
-   /* faces on "external" boundaries will be */
-   /* symmetry plane or free surface BCs */
-   for (Index_t i=0; i<edgeElems; ++i) {
-      Index_t planeInc = i*edgeElems*edgeElems ;
-      Index_t rowInc   = i*edgeElems ;
-      for (Index_t j=0; j<edgeElems; ++j) {
-         domain.elemBC[planeInc+j*edgeElems] |= XI_M_SYMM ;
-         domain.elemBC[planeInc+j*edgeElems+edgeElems-1] |= XI_P_FREE ;
-         domain.elemBC[planeInc+j] |= ETA_M_SYMM ;
-         domain.elemBC[planeInc+j+edgeElems*edgeElems-edgeElems] |= ETA_P_FREE ;
-         domain.elemBC[rowInc+j] |= ZETA_M_SYMM ;
-         domain.elemBC[rowInc+j+domElems-edgeElems*edgeElems] |= ZETA_P_FREE ;
-      }
-   }
-
-   /* Create domain Index Sets */
-
-   /* always leave the nodes in a canonical ordering */
-   domain.domNodeList = new RAJA::IndexSet() ;
-   domain.domNodeList->push_back( RAJA::RangeSegment(0, domNodes) );
-
-   domain.domElemList = new RAJA::IndexSet() ;
-   domain.matElemList = new RAJA::IndexSet() ;
-
-   const Index_t xtile = lulesh_xtile ;
-   const Index_t ytile = lulesh_ytile ;
-   const Index_t ztile = lulesh_ztile ;
-
-#if 0
-   if ( lulesh_tiling_mode == Tiled_LockFree ) {
-      printf("Tiled_LockFree ordering not implemented!!! Canonical will be used.\n");
-      lulesh_tiling_mode = Canonical; 
-   }
-#endif
-
-   switch (lulesh_tiling_mode) {
-
-      case Canonical:
-      {
-         domain.domElemList->push_back( RAJA::RangeSegment(0, domElems) );
-
-         /* Create a material IndexSet (entire domain same material for now) */
-         domain.matElemList->push_back( RAJA::RangeSegment(0, domElems) );
-      }
-      break ;
-
-      case Tiled_Index:
-      {
-         for (Index_t zt = 0; zt < ztile; ++zt) {
-            for (Index_t yt = 0; yt < ytile; ++yt) {
-               for (Index_t xt = 0; xt < xtile; ++xt) {
-                  Index_t xbegin =  edgeElems*( xt )/xtile ;
-                  Index_t xend   =  edgeElems*(xt+1)/xtile ;
-                  Index_t ybegin =  edgeElems*( yt )/ytile ;
-                  Index_t yend   =  edgeElems*(yt+1)/ytile ;
-                  Index_t zbegin =  edgeElems*( zt )/ztile ;
-                  Index_t zend   =  edgeElems*(zt+1)/ztile ;
-                  Index_t tileSize = 
-                     (xend - xbegin)*(yend-ybegin)*(zend-zbegin) ;
-#if 0
-                  Index_t tileIdx[tileSize] ;
-#else
-                  Index_t* tileIdx = new Index_t[tileSize] ;
-#endif
-                  Index_t idx = 0 ;
-
-                  for (Index_t plane = zbegin; plane<zend; ++plane) {
-                     for (Index_t row = ybegin; row<yend; ++row) {
-                        for (Index_t col = xbegin; col<xend; ++col) {
-                           tileIdx[idx++] = 
-                              (plane*edgeElems + row)*edgeElems + col ;
-                        }
-                     }
-                  }
-                  domain.domElemList->push_back( RAJA::ListSegment(tileIdx, tileSize) );
-                  domain.matElemList->push_back( RAJA::ListSegment(tileIdx, tileSize) );
-
-#if 0
-#else
-                  delete [] tileIdx ;
-#endif
-               }
-            }
-         }
-      }
-      break ;
-
-      case Tiled_Order:
-      {
-         Index_t idx = 0 ;
-         Index_t perm[domElems] ;
-         Index_t iperm[domElems] ; /* inverse permutation */
-         Index_t tileBegin = 0 ;
-         for (Index_t zt = 0; zt < ztile; ++zt) {
-            for (Index_t yt = 0; yt < ytile; ++yt) {
-               for (Index_t xt = 0; xt < xtile; ++xt) {
-                  Index_t xbegin =  edgeElems*( xt )/xtile ;
-                  Index_t xend   =  edgeElems*(xt+1)/xtile ;
-                  Index_t ybegin =  edgeElems*( yt )/ytile ;
-                  Index_t yend   =  edgeElems*(yt+1)/ytile ;
-                  Index_t zbegin =  edgeElems*( zt )/ztile ;
-                  Index_t zend   =  edgeElems*(zt+1)/ztile ;
-                  Index_t tileSize = 
-                     (xend - xbegin)*(yend-ybegin)*(zend-zbegin) ;
-
-                  for (Index_t plane = zbegin; plane<zend; ++plane) {
-                     for (Index_t row = ybegin; row<yend; ++row) {
-                        for (Index_t col = xbegin; col<xend; ++col) {
-                           perm[idx] = 
-                              (plane*edgeElems + row)*edgeElems + col ;
-                           iperm[perm[idx]] = idx ;
-                           ++idx ;
-                        }
-                     }
-                  }
-                  Index_t tileEnd = tileBegin + tileSize ;
-                  domain.domElemList->push_back( RAJA::RangeSegment(tileBegin, tileEnd) );
-                  domain.matElemList->push_back( RAJA::RangeSegment(tileBegin, tileEnd) );
-                  tileBegin = tileEnd ;
-               }
-            }
-         }
-         /* permute nodelist connectivity */
-         {
-            Index_t tmp[8*domElems] ;
-            for (Index_t i=0; i<domElems; ++i) {
-               for (Index_t j=0; j<8; ++j) {
-                  tmp[i*8+j] = domain.nodelist[perm[i]*8+j] ;
-               }
-            }
-            for (Index_t i=0; i<8*domElems; ++i) {
-               domain.nodelist[i] = tmp[i] ;
-            }
-         }
-         /* permute volo */
-         {
-            Real_t tmp[domElems] ;
-            for (Index_t i=0; i<domElems; ++i) {
-               tmp[i] = domain.volo[perm[i]] ;
-            }
-            for (Index_t i=0; i<domElems; ++i) {
-               domain.volo[i] = tmp[i] ;
-            }
-         }
-         /* permute elemMass */
-         {
-            Real_t tmp[domElems] ;
-            for (Index_t i=0; i<domElems; ++i) {
-               tmp[i] = domain.elemMass[perm[i]] ;
-            }
-            for (Index_t i=0; i<domElems; ++i) {
-               domain.elemMass[i] = tmp[i] ;
-            }
-         }
-         /* permute lxim, lxip, letam, letap, lzetam, lzetap */
-         {
-            Index_t tmp[6*domElems] ;
-            for (Index_t i=0; i<domElems; ++i) {
-               tmp[i*6+0] = iperm[domain.lxim[perm[i]]] ;
-               tmp[i*6+1] = iperm[domain.lxip[perm[i]]] ;
-               tmp[i*6+2] = iperm[domain.letam[perm[i]]] ;
-               tmp[i*6+3] = iperm[domain.letap[perm[i]]] ;
-               tmp[i*6+4] = iperm[domain.lzetam[perm[i]]] ;
-               tmp[i*6+5] = iperm[domain.lzetap[perm[i]]] ;
-            }
-            for (Index_t i=0; i<domElems; ++i) {
-               domain.lxim[i] = tmp[i*6+0] ;
-               domain.lxip[i] = tmp[i*6+1] ;
-               domain.letam[i] = tmp[i*6+2] ;
-               domain.letap[i] = tmp[i*6+3] ;
-               domain.lzetam[i] = tmp[i*6+4] ;
-               domain.lzetap[i] = tmp[i*6+5] ;
-            }
-         }
-         /* permute elemBC */
-         {
-            Int_t tmp[domElems] ;
-            for (Index_t i=0; i<domElems; ++i) {
-               tmp[i] = domain.elemBC[perm[i]] ;
-            }
-            for (Index_t i=0; i<domElems; ++i) {
-               domain.elemBC[i] = tmp[i] ;
-            }
-         }
-      }
-      break ;
-
-      case Tiled_LockFree:
-      {
-         buildLockFreeBlockIndexset( *domain.domElemList,
-                                     edgeElems, edgeElems, edgeElems) ;
-
-         /* Create a material ISet (entire domain same material for now) */
-         buildLockFreeBlockIndexset ( *domain.matElemList,
-                                      edgeElems, edgeElems, edgeElems) ;
-      }
-      break;
-
-      default :
-      {
-         printf("Unknown index set ordering!!! Left undefined.\n");
-      }
-   }
-
-   /* timestep to solution */
-   timer_cycle.start("timer_cycle");
-   while((domain.time < domain.stoptime) && (domain.cycle < maxIter)) {
-      TimeIncrement(&domain) ;
-      LagrangeLeapFrog(&domain) ;
-      /* problem->commNodes->Transfer(CommNodes::syncposvel) ; */
-      if ( show_run_progress ) {
-         printf("cycle = %d, time = %e, dt=%e\n",
-                domain.cycle,double(domain.time), double(domain.deltatime) ) ;
-      }
-   }
-   timer_cycle.stop("timer_cycle");
-
-   timer_main.stop("timer_main");
-
-   printf("Total Cycle Time (sec) = %Lf\n", timer_cycle.elapsed() );
-   printf("Total main Time (sec) = %Lf\n", timer_main.elapsed() );
-
-   return 0 ;
-}
diff --git a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/sigcatch.cmd b/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/sigcatch.cmd
deleted file mode 100644
index caf3f7e19..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_RAJA-variants/sigcatch.cmd
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-#SBATCH -p pdebug
-#SBATCH --signal=USR2@120
-#SBATCH -t 5:00
-
-echo -n 'The start time is: ';date
-# trap "echo trap" SIGUSR1
-# On rzmerl, % sbatch sigcatch.cmd
-# % scancel -s USR2  <job_id>     do this repeatedly to simulate faults
-srun /g/g19/keasler/RAJA_RICH/raja/test/LULESH-v1.0/lulesh-RAJA-parallel-ft.exe
-echo -n 'The completion time is: ';date
-
diff --git a/test/LULESH-v1.0/LULESH-v1.0_baseline/CMakeLists.txt b/test/LULESH-v1.0/LULESH-v1.0_baseline/CMakeLists.txt
deleted file mode 100644
index b04acbd9a..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_baseline/CMakeLists.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-
-if(RAJA_ENABLE_OPENMP)
-  add_executable(lulesh-OMP.exe
-    luleshOMP.cc)
-
-  target_link_libraries(lulesh-OMP.exe
-    ${RT_LIBRARIES})
-
-  add_executable(lulesh-OMP_NG.exe
-    luleshOMP_NG.cc)
-
-  target_link_libraries(
-    lulesh-OMP_NG.exe
-    ${RT_LIBRARIES})
-endif()
-
-add_executable(lulesh-serial.exe
-  lulesh.cc)
-
-target_link_libraries(
-  lulesh-serial.exe
-  ${RT_LIBRARIES})
diff --git a/test/LULESH-v1.0/LULESH-v1.0_baseline/lulesh.cc b/test/LULESH-v1.0/LULESH-v1.0_baseline/lulesh.cc
deleted file mode 100644
index ea1d02a5e..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_baseline/lulesh.cc
+++ /dev/null
@@ -1,2936 +0,0 @@
-/*
-
-                 Copyright (c) 2010.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 1.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <vector>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "Timer.hxx"
-
-#define LULESH_SHOW_PROGRESS 0
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-/****************************************************/
-/* Allow flexibility for arithmetic representations */
-/****************************************************/
-
-/* Could also support fixed point and interval arithmetic types */
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  /* 10 bytes on x86 */
-
-typedef int    Index_t ; /* array subscript and loop index */
-typedef real8  Real_t ;  /* floating point representation */
-typedef int    Int_t ;   /* integer representation */
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-/************************************************************/
-/* Allow for flexible data layout experiments by separating */
-/* array interface from underlying implementation.          */
-/************************************************************/
-
-struct Mesh {
-
-/* This first implementation allows for runnable code */
-/* and is not meant to be optimal. Final implementation */
-/* should separate declaration and allocation phases */
-/* so that allocation can be scheduled in a cache conscious */
-/* manner. */
-
-public:
-
-   /**************/
-   /* Allocation */
-   /**************/
-
-   void AllocateNodalPersistent(size_t size)
-   {
-      m_x.reserve(size) ;
-      m_y.reserve(size) ;
-      m_z.reserve(size) ;
-
-      m_xd.reserve(size) ;
-      m_yd.reserve(size) ;
-      m_zd.reserve(size) ;
-
-      m_xdd.reserve(size) ;
-      m_ydd.reserve(size) ;
-      m_zdd.reserve(size) ;
-
-      m_fx.reserve(size) ;
-      m_fy.reserve(size) ;
-      m_fz.reserve(size) ;
-
-      m_nodalMass.reserve(size) ;
-   }
-
-   void AllocateElemPersistent(size_t size)
-   {
-      m_matElemlist.reserve(size) ;
-      m_nodelist.reserve(8*size) ;
-
-      m_lxim.reserve(size) ;
-      m_lxip.reserve(size) ;
-      m_letam.reserve(size) ;
-      m_letap.reserve(size) ;
-      m_lzetam.reserve(size) ;
-      m_lzetap.reserve(size) ;
-
-      m_elemBC.reserve(size) ;
-
-      m_e.reserve(size) ;
-
-      m_p.reserve(size) ;
-      m_q.reserve(size) ;
-      m_ql.reserve(size) ;
-      m_qq.reserve(size) ;
-
-      m_v.reserve(size) ;
-      m_volo.reserve(size) ;
-      m_delv.reserve(size) ;
-      m_vdov.reserve(size) ;
-
-      m_arealg.reserve(size) ;
-   
-      m_ss.reserve(size) ;
-
-      m_elemMass.reserve(size) ;
-   }
-
-   /* Temporaries should not be initialized in bulk but */
-   /* this is a runnable placeholder for now */
-   void AllocateElemTemporary(size_t size)
-   {
-      m_dxx.reserve(size) ;
-      m_dyy.reserve(size) ;
-      m_dzz.reserve(size) ;
-
-      m_delv_xi.reserve(size) ;
-      m_delv_eta.reserve(size) ;
-      m_delv_zeta.reserve(size) ;
-
-      m_delx_xi.reserve(size) ;
-      m_delx_eta.reserve(size) ;
-      m_delx_zeta.reserve(size) ;
-
-      m_vnew.reserve(size) ;
-   }
-
-   void AllocateNodesets(size_t size)
-   {
-      m_symmX.reserve(size) ;
-      m_symmY.reserve(size) ;
-      m_symmZ.reserve(size) ;
-   }
-   
-   /**********/
-   /* Access */
-   /**********/
-
-   /* Node-centered */
-
-   Real_t& x(Index_t idx)    { return m_x[idx] ; }
-   Real_t& y(Index_t idx)    { return m_y[idx] ; }
-   Real_t& z(Index_t idx)    { return m_z[idx] ; }
-
-   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
-   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
-   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
-
-   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
-   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
-   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
-
-   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
-   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
-   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
-
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   Index_t&  symmX(Index_t idx) { return m_symmX[idx] ; }
-   Index_t&  symmY(Index_t idx) { return m_symmY[idx] ; }
-   Index_t&  symmZ(Index_t idx) { return m_symmZ[idx] ; }
-
-   /* Element-centered */
-
-   Index_t&  matElemlist(Index_t idx) { return m_matElemlist[idx] ; }
-   Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
-
-   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
-   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
-   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
-   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
-   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
-   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
-
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   Real_t& p(Index_t idx)          { return m_p[idx] ; }
-   Real_t& q(Index_t idx)          { return m_q[idx] ; }
-   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
-   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
-
-   Real_t& v(Index_t idx)          { return m_v[idx] ; }
-   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
-   Real_t& vnew(Index_t idx)       { return m_vnew[idx] ; }
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-   
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   /* Params */
-
-   Real_t& dtfixed()              { return m_dtfixed ; }
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-
-   Real_t& u_cut()                { return m_u_cut ; }
-   Real_t& hgcoef()               { return m_hgcoef ; }
-   Real_t& qstop()                { return m_qstop ; }
-   Real_t& monoq_max_slope()      { return m_monoq_max_slope ; }
-   Real_t& monoq_limiter_mult()   { return m_monoq_limiter_mult ; }
-   Real_t& e_cut()                { return m_e_cut ; }
-   Real_t& p_cut()                { return m_p_cut ; }
-   Real_t& ss4o3()                { return m_ss4o3 ; }
-   Real_t& q_cut()                { return m_q_cut ; }
-   Real_t& v_cut()                { return m_v_cut ; }
-   Real_t& qlc_monoq()            { return m_qlc_monoq ; }
-   Real_t& qqc_monoq()            { return m_qqc_monoq ; }
-   Real_t& qqc()                  { return m_qqc ; }
-   Real_t& eosvmax()              { return m_eosvmax ; }
-   Real_t& eosvmin()              { return m_eosvmin ; }
-   Real_t& pmin()                 { return m_pmin ; }
-   Real_t& emin()                 { return m_emin ; }
-   Real_t& dvovmax()              { return m_dvovmax ; }
-   Real_t& refdens()              { return m_refdens ; }
-
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-
-private:
-
-   /******************/
-   /* Implementation */
-   /******************/
-
-   /* Node-centered */
-
-   std::vector<Real_t> m_x ;  /* coordinates */
-   std::vector<Real_t> m_y ;
-   std::vector<Real_t> m_z ;
-
-   std::vector<Real_t> m_xd ; /* velocities */
-   std::vector<Real_t> m_yd ;
-   std::vector<Real_t> m_zd ;
-
-   std::vector<Real_t> m_xdd ; /* accelerations */
-   std::vector<Real_t> m_ydd ;
-   std::vector<Real_t> m_zdd ;
-
-   std::vector<Real_t> m_fx ;  /* forces */
-   std::vector<Real_t> m_fy ;
-   std::vector<Real_t> m_fz ;
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
-   std::vector<Index_t> m_symmY ;
-   std::vector<Index_t> m_symmZ ;
-
-   /* Element-centered */
-
-   std::vector<Index_t>  m_matElemlist ;  /* material indexset */
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */
-   std::vector<Index_t>  m_lxip ;
-   std::vector<Index_t>  m_letam ;
-   std::vector<Index_t>  m_letap ;
-   std::vector<Index_t>  m_lzetam ;
-   std::vector<Index_t>  m_lzetap ;
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   std::vector<Real_t> m_p ;   /* pressure */
-   std::vector<Real_t> m_q ;   /* q */
-   std::vector<Real_t> m_ql ;  /* linear term for q */
-   std::vector<Real_t> m_qq ;  /* quadratic term for q */
-
-   std::vector<Real_t> m_v ;     /* relative volume */
-   std::vector<Real_t> m_volo ;  /* reference volume */
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   /* Parameters */
-
-   Real_t  m_dtfixed ;           /* fixed time increment */
-   Real_t  m_time ;              /* current time */
-   Real_t  m_deltatime ;         /* variable time increment */
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_stoptime ;          /* end time for simulation */
-
-   Real_t  m_u_cut ;             /* velocity tolerance */
-   Real_t  m_hgcoef ;            /* hourglass control */
-   Real_t  m_qstop ;             /* excessive q indicator */
-   Real_t  m_monoq_max_slope ;
-   Real_t  m_monoq_limiter_mult ;
-   Real_t  m_e_cut ;             /* energy tolerance */
-   Real_t  m_p_cut ;             /* pressure tolerance */
-   Real_t  m_ss4o3 ;
-   Real_t  m_q_cut ;             /* q tolerance */
-   Real_t  m_v_cut ;             /* relative volume tolerance */
-   Real_t  m_qlc_monoq ;         /* linear term coef for q */
-   Real_t  m_qqc_monoq ;         /* quadratic term coef for q */
-   Real_t  m_qqc ;
-   Real_t  m_eosvmax ;
-   Real_t  m_eosvmin ;
-   Real_t  m_pmin ;              /* pressure floor */
-   Real_t  m_emin ;              /* energy floor */
-   Real_t  m_dvovmax ;           /* maximum allowable volume change */
-   Real_t  m_refdens ;           /* reference density */
-
-   Real_t  m_dtcourant ;         /* courant constraint */
-   Real_t  m_dthydro ;           /* volume change constraint */
-   Real_t  m_dtmax ;             /* maximum allowable time increment */
-
-   Int_t   m_cycle ;             /* iteration count for simulation */
-
-   Index_t   m_sizeX ;           /* X,Y,Z extent of this block */
-   Index_t   m_sizeY ;
-   Index_t   m_sizeZ ;
-
-   Index_t   m_numElem ;         /* Elements/Nodes in this domain */
-   Index_t   m_numNode ;
-} mesh ;
-
-
-template <typename T>
-T *Allocate(size_t size)
-{
-   return static_cast<T *>(malloc(sizeof(T)*size)) ;
-}
-
-template <typename T>
-void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-
-/* Stuff needed for boundary conditions */
-/* 2 BCs on each of 6 hexahedral faces (12 bits) */
-#define XI_M        0x003
-#define XI_M_SYMM   0x001
-#define XI_M_FREE   0x002
-
-#define XI_P        0x00c
-#define XI_P_SYMM   0x004
-#define XI_P_FREE   0x008
-
-#define ETA_M       0x030
-#define ETA_M_SYMM  0x010
-#define ETA_M_FREE  0x020
-
-#define ETA_P       0x0c0
-#define ETA_P_SYMM  0x040
-#define ETA_P_FREE  0x080
-
-#define ZETA_M      0x300
-#define ZETA_M_SYMM 0x100
-#define ZETA_M_FREE 0x200
-
-#define ZETA_P      0xc00
-#define ZETA_P_SYMM 0x400
-#define ZETA_P_FREE 0x800
-
-
-static inline
-void TimeIncrement()
-{
-   Real_t targetdt = mesh.stoptime() - mesh.time() ;
-
-   if ((mesh.dtfixed() <= Real_t(0.0)) && (mesh.cycle() != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = mesh.deltatime() ;
-
-      /* This will require a reduction in parallel */
-      Real_t newdt = Real_t(1.0e+20) ;
-      if (mesh.dtcourant() < newdt) {
-         newdt = mesh.dtcourant() / Real_t(2.0) ;
-      }
-      if (mesh.dthydro() < newdt) {
-         newdt = mesh.dthydro() * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < mesh.deltatimemultlb()) {
-            newdt = olddt ;
-         }
-         else if (ratio > mesh.deltatimemultub()) {
-            newdt = olddt*mesh.deltatimemultub() ;
-         }
-      }
-
-      if (newdt > mesh.dtmax()) {
-         newdt = mesh.dtmax() ;
-      }
-      mesh.deltatime() = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > mesh.deltatime()) &&
-       (targetdt < (Real_t(4.0) * mesh.deltatime() / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * mesh.deltatime() / Real_t(3.0) ;
-   }
-
-   if (targetdt < mesh.deltatime()) {
-      mesh.deltatime() = targetdt ;
-   }
-
-   mesh.time() += mesh.deltatime() ;
-
-   ++mesh.cycle() ;
-}
-
-static inline
-void InitStressTermsForElems(Index_t numElem, 
-                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-   for (Index_t i = 0 ; i < numElem ; ++i){
-      sigxx[i] =  sigyy[i] = sigzz[i] =  - mesh.p(i) - mesh.q(i) ;
-   }
-}
-
-static inline
-void CalcElemShapeFunctionDerivatives( const Real_t* const x,
-                                       const Real_t* const y,
-                                       const Real_t* const z,
-                                       Real_t b[][8],
-                                       Real_t* const volume )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-static inline
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-static inline
-void CalcElemNodeNormals(Real_t pfx[8],
-                         Real_t pfy[8],
-                         Real_t pfz[8],
-                         const Real_t x[8],
-                         const Real_t y[8],
-                         const Real_t z[8])
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-static inline
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_t* const fx,
-                                  Real_t* const fy,
-                                  Real_t* const fz )
-{
-  Real_t pfx0 = B[0][0] ;   Real_t pfx1 = B[0][1] ;
-  Real_t pfx2 = B[0][2] ;   Real_t pfx3 = B[0][3] ;
-  Real_t pfx4 = B[0][4] ;   Real_t pfx5 = B[0][5] ;
-  Real_t pfx6 = B[0][6] ;   Real_t pfx7 = B[0][7] ;
-
-  Real_t pfy0 = B[1][0] ;   Real_t pfy1 = B[1][1] ;
-  Real_t pfy2 = B[1][2] ;   Real_t pfy3 = B[1][3] ;
-  Real_t pfy4 = B[1][4] ;   Real_t pfy5 = B[1][5] ;
-  Real_t pfy6 = B[1][6] ;   Real_t pfy7 = B[1][7] ;
-
-  Real_t pfz0 = B[2][0] ;   Real_t pfz1 = B[2][1] ;
-  Real_t pfz2 = B[2][2] ;   Real_t pfz3 = B[2][3] ;
-  Real_t pfz4 = B[2][4] ;   Real_t pfz5 = B[2][5] ;
-  Real_t pfz6 = B[2][6] ;   Real_t pfz7 = B[2][7] ;
-
-  fx[0] = -( stress_xx * pfx0 );
-  fx[1] = -( stress_xx * pfx1 );
-  fx[2] = -( stress_xx * pfx2 );
-  fx[3] = -( stress_xx * pfx3 );
-  fx[4] = -( stress_xx * pfx4 );
-  fx[5] = -( stress_xx * pfx5 );
-  fx[6] = -( stress_xx * pfx6 );
-  fx[7] = -( stress_xx * pfx7 );
-
-  fy[0] = -( stress_yy * pfy0  );
-  fy[1] = -( stress_yy * pfy1  );
-  fy[2] = -( stress_yy * pfy2  );
-  fy[3] = -( stress_yy * pfy3  );
-  fy[4] = -( stress_yy * pfy4  );
-  fy[5] = -( stress_yy * pfy5  );
-  fy[6] = -( stress_yy * pfy6  );
-  fy[7] = -( stress_yy * pfy7  );
-
-  fz[0] = -( stress_zz * pfz0 );
-  fz[1] = -( stress_zz * pfz1 );
-  fz[2] = -( stress_zz * pfz2 );
-  fz[3] = -( stress_zz * pfz3 );
-  fz[4] = -( stress_zz * pfz4 );
-  fz[5] = -( stress_zz * pfz5 );
-  fz[6] = -( stress_zz * pfz6 );
-  fz[7] = -( stress_zz * pfz7 );
-}
-
-static inline
-void IntegrateStressForElems( Index_t numElem,
-                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                              Real_t *determ)
-{
-  Real_t B[3][8] ;// shape function derivatives
-  Real_t x_local[8] ;
-  Real_t y_local[8] ;
-  Real_t z_local[8] ;
-  Real_t fx_local[8] ;
-  Real_t fy_local[8] ;
-  Real_t fz_local[8] ;
-
-  // loop over all elements
-  for( Index_t k=0 ; k<numElem ; ++k )
-  {
-    const Index_t* const elemNodes = mesh.nodelist(k);
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      x_local[lnode] = mesh.x(gnode);
-      y_local[lnode] = mesh.y(gnode);
-      z_local[lnode] = mesh.z(gnode);
-    }
-
-    /* Volume calculation involves extra work for numerical consistency. */
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                         B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                          x_local, y_local, z_local );
-
-    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                         fx_local, fy_local, fz_local ) ;
-
-    // copy nodal force contributions to global force arrray.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      mesh.fx(gnode) += fx_local[lnode];
-      mesh.fy(gnode) += fy_local[lnode];
-      mesh.fz(gnode) += fz_local[lnode];
-    }
-  }
-}
-
-static inline
-void CollectDomainNodesToElemNodes(const Index_t* elemToNode,
-                                   Real_t elemX[8],
-                                   Real_t elemY[8],
-                                   Real_t elemZ[8])
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = mesh.x(nd0i);
-   elemX[1] = mesh.x(nd1i);
-   elemX[2] = mesh.x(nd2i);
-   elemX[3] = mesh.x(nd3i);
-   elemX[4] = mesh.x(nd4i);
-   elemX[5] = mesh.x(nd5i);
-   elemX[6] = mesh.x(nd6i);
-   elemX[7] = mesh.x(nd7i);
-
-   elemY[0] = mesh.y(nd0i);
-   elemY[1] = mesh.y(nd1i);
-   elemY[2] = mesh.y(nd2i);
-   elemY[3] = mesh.y(nd3i);
-   elemY[4] = mesh.y(nd4i);
-   elemY[5] = mesh.y(nd5i);
-   elemY[6] = mesh.y(nd6i);
-   elemY[7] = mesh.y(nd7i);
-
-   elemZ[0] = mesh.z(nd0i);
-   elemZ[1] = mesh.z(nd1i);
-   elemZ[2] = mesh.z(nd2i);
-   elemZ[3] = mesh.z(nd3i);
-   elemZ[4] = mesh.z(nd4i);
-   elemZ[5] = mesh.z(nd5i);
-   elemZ[6] = mesh.z(nd6i);
-   elemZ[7] = mesh.z(nd7i);
-
-}
-
-static inline
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-static inline
-void CalcElemVolumeDerivative(Real_t dvdx[8],
-                              Real_t dvdy[8],
-                              Real_t dvdz[8],
-                              const Real_t x[8],
-                              const Real_t y[8],
-                              const Real_t z[8])
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-static inline
-void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t *hourgam0,
-                              Real_t *hourgam1, Real_t *hourgam2, Real_t *hourgam3,
-                              Real_t *hourgam4, Real_t *hourgam5, Real_t *hourgam6,
-                              Real_t *hourgam7, Real_t coefficient,
-                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
-{
-   Index_t i00=0;
-   Index_t i01=1;
-   Index_t i02=2;
-   Index_t i03=3;
-
-   Real_t h00 =
-      hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] +
-      hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] +
-      hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] +
-      hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7];
-
-   Real_t h01 =
-      hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] +
-      hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] +
-      hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] +
-      hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7];
-
-   Real_t h02 =
-      hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+
-      hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+
-      hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+
-      hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7];
-
-   Real_t h03 =
-      hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] +
-      hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] +
-      hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] +
-      hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7];
-
-   hgfx[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfx[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfx[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfx[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfx[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfx[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfx[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfx[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] +
-      hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] +
-      hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] +
-      hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7];
-
-   h01 =
-      hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] +
-      hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] +
-      hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] +
-      hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7];
-
-   h02 =
-      hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+
-      hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+
-      hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+
-      hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7];
-
-   h03 =
-      hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] +
-      hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] +
-      hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] +
-      hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7];
-
-
-   hgfy[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfy[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfy[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfy[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfy[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfy[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfy[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfy[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] +
-      hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] +
-      hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] +
-      hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7];
-
-   h01 =
-      hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] +
-      hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] +
-      hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] +
-      hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7];
-
-   h02 =
-      hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+
-      hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+
-      hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+
-      hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7];
-
-   h03 =
-      hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] +
-      hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] +
-      hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] +
-      hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7];
-
-
-   hgfz[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfz[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfz[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfz[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfz[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfz[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfz[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfz[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-}
-
-static inline
-void CalcFBHourglassForceForElems(Real_t *determ,
-            Real_t *x8n,      Real_t *y8n,      Real_t *z8n,
-            Real_t *dvdx,     Real_t *dvdy,     Real_t *dvdz,
-            Real_t hourg)
-{
-   /*************************************************
-    *
-    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-    *               force.
-    *
-    *************************************************/
-
-   Index_t numElem = mesh.numElem() ;
-
-   Real_t hgfx[8], hgfy[8], hgfz[8] ;
-
-   Real_t coefficient;
-
-   Real_t  gamma[4][8];
-   Real_t hourgam0[4], hourgam1[4], hourgam2[4], hourgam3[4] ;
-   Real_t hourgam4[4], hourgam5[4], hourgam6[4], hourgam7[4];
-   Real_t xd1[8], yd1[8], zd1[8] ;
-
-   gamma[0][0] = Real_t( 1.);
-   gamma[0][1] = Real_t( 1.);
-   gamma[0][2] = Real_t(-1.);
-   gamma[0][3] = Real_t(-1.);
-   gamma[0][4] = Real_t(-1.);
-   gamma[0][5] = Real_t(-1.);
-   gamma[0][6] = Real_t( 1.);
-   gamma[0][7] = Real_t( 1.);
-   gamma[1][0] = Real_t( 1.);
-   gamma[1][1] = Real_t(-1.);
-   gamma[1][2] = Real_t(-1.);
-   gamma[1][3] = Real_t( 1.);
-   gamma[1][4] = Real_t(-1.);
-   gamma[1][5] = Real_t( 1.);
-   gamma[1][6] = Real_t( 1.);
-   gamma[1][7] = Real_t(-1.);
-   gamma[2][0] = Real_t( 1.);
-   gamma[2][1] = Real_t(-1.);
-   gamma[2][2] = Real_t( 1.);
-   gamma[2][3] = Real_t(-1.);
-   gamma[2][4] = Real_t( 1.);
-   gamma[2][5] = Real_t(-1.);
-   gamma[2][6] = Real_t( 1.);
-   gamma[2][7] = Real_t(-1.);
-   gamma[3][0] = Real_t(-1.);
-   gamma[3][1] = Real_t( 1.);
-   gamma[3][2] = Real_t(-1.);
-   gamma[3][3] = Real_t( 1.);
-   gamma[3][4] = Real_t( 1.);
-   gamma[3][5] = Real_t(-1.);
-   gamma[3][6] = Real_t( 1.);
-   gamma[3][7] = Real_t(-1.);
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-
-   for(Index_t i2=0;i2<numElem;++i2){
-      const Index_t *elemToNode = mesh.nodelist(i2);
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
-            x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
-            x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
-            x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
-            y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
-            y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
-            y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
-            z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
-            z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
-            z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];
-
-         hourgam0[i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam1[i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam2[i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam3[i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam4[i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam5[i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam6[i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam7[i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=mesh.ss(i2);
-      mass1=mesh.elemMass(i2);
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = mesh.xd(n0si2);
-      xd1[1] = mesh.xd(n1si2);
-      xd1[2] = mesh.xd(n2si2);
-      xd1[3] = mesh.xd(n3si2);
-      xd1[4] = mesh.xd(n4si2);
-      xd1[5] = mesh.xd(n5si2);
-      xd1[6] = mesh.xd(n6si2);
-      xd1[7] = mesh.xd(n7si2);
-
-      yd1[0] = mesh.yd(n0si2);
-      yd1[1] = mesh.yd(n1si2);
-      yd1[2] = mesh.yd(n2si2);
-      yd1[3] = mesh.yd(n3si2);
-      yd1[4] = mesh.yd(n4si2);
-      yd1[5] = mesh.yd(n5si2);
-      yd1[6] = mesh.yd(n6si2);
-      yd1[7] = mesh.yd(n7si2);
-
-      zd1[0] = mesh.zd(n0si2);
-      zd1[1] = mesh.zd(n1si2);
-      zd1[2] = mesh.zd(n2si2);
-      zd1[3] = mesh.zd(n3si2);
-      zd1[4] = mesh.zd(n4si2);
-      zd1[5] = mesh.zd(n5si2);
-      zd1[6] = mesh.zd(n6si2);
-      zd1[7] = mesh.zd(n7si2);
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1,
-                      hourgam0,hourgam1,hourgam2,hourgam3,
-                      hourgam4,hourgam5,hourgam6,hourgam7,
-                      coefficient, hgfx, hgfy, hgfz);
-
-      mesh.fx(n0si2) += hgfx[0];
-      mesh.fy(n0si2) += hgfy[0];
-      mesh.fz(n0si2) += hgfz[0];
-
-      mesh.fx(n1si2) += hgfx[1];
-      mesh.fy(n1si2) += hgfy[1];
-      mesh.fz(n1si2) += hgfz[1];
-
-      mesh.fx(n2si2) += hgfx[2];
-      mesh.fy(n2si2) += hgfy[2];
-      mesh.fz(n2si2) += hgfz[2];
-
-      mesh.fx(n3si2) += hgfx[3];
-      mesh.fy(n3si2) += hgfy[3];
-      mesh.fz(n3si2) += hgfz[3];
-
-      mesh.fx(n4si2) += hgfx[4];
-      mesh.fy(n4si2) += hgfy[4];
-      mesh.fz(n4si2) += hgfz[4];
-
-      mesh.fx(n5si2) += hgfx[5];
-      mesh.fy(n5si2) += hgfy[5];
-      mesh.fz(n5si2) += hgfz[5];
-
-      mesh.fx(n6si2) += hgfx[6];
-      mesh.fy(n6si2) += hgfy[6];
-      mesh.fz(n6si2) += hgfz[6];
-
-      mesh.fx(n7si2) += hgfx[7];
-      mesh.fy(n7si2) += hgfy[7];
-      mesh.fz(n7si2) += hgfz[7];
-   }
-}
-
-static inline
-void CalcHourglassControlForElems(Real_t determ[], Real_t hgcoef)
-{
-   Index_t i, ii, jj ;
-   Real_t  x1[8],  y1[8],  z1[8] ;
-   Real_t pfx[8], pfy[8], pfz[8] ;
-   Index_t numElem = mesh.numElem() ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *dvdx = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdy = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdz = Allocate<Real_t>(numElem8) ;
-   Real_t *x8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *y8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *z8n  = Allocate<Real_t>(numElem8) ;
-
-   /* start loop over elements */
-   for (i=0 ; i<numElem ; ++i){
-
-      Index_t* elemToNode = mesh.nodelist(i);
-      CollectDomainNodesToElemNodes(elemToNode, x1, y1, z1);
-
-      CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
-
-      /* load into temporary storage for FB Hour Glass control */
-      for(ii=0;ii<8;++ii){
-         jj=8*i+ii;
-
-         dvdx[jj] = pfx[ii];
-         dvdy[jj] = pfy[ii];
-         dvdz[jj] = pfz[ii];
-         x8n[jj]  = x1[ii];
-         y8n[jj]  = y1[ii];
-         z8n[jj]  = z1[ii];
-      }
-
-      determ[i] = mesh.volo(i) * mesh.v(i);
-
-      /* Do a check for negative volumes */
-      if ( mesh.v(i) <= Real_t(0.0) ) {
-         exit(VolumeError) ;
-      }
-   }
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems(determ,x8n,y8n,z8n,dvdx,dvdy,dvdz,hgcoef) ;
-   }
-
-   Release(&z8n) ;
-   Release(&y8n) ;
-   Release(&x8n) ;
-   Release(&dvdz) ;
-   Release(&dvdy) ;
-   Release(&dvdx) ;
-
-   return ;
-}
-
-static inline
-void CalcVolumeForceForElems()
-{
-   Index_t numElem = mesh.numElem() ;
-   if (numElem != 0) {
-      Real_t  hgcoef = mesh.hgcoef() ;
-      Real_t *sigxx  = Allocate<Real_t>(numElem) ;
-      Real_t *sigyy  = Allocate<Real_t>(numElem) ;
-      Real_t *sigzz  = Allocate<Real_t>(numElem) ;
-      Real_t *determ = Allocate<Real_t>(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(numElem, sigxx, sigyy, sigzz);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( numElem, sigxx, sigyy, sigzz, determ) ;
-
-      // check for negative element volume
-      for ( Index_t k=0 ; k<numElem ; ++k ) {
-         if (determ[k] <= Real_t(0.0)) {
-            exit(VolumeError) ;
-         }
-      }
-
-      CalcHourglassControlForElems(determ, hgcoef) ;
-
-      Release(&determ) ;
-      Release(&sigzz) ;
-      Release(&sigyy) ;
-      Release(&sigxx) ;
-   }
-}
-
-static inline void CalcForceForNodes()
-{
-  Index_t numNode = mesh.numNode() ;
-  for (Index_t i=0; i<numNode; ++i) {
-     mesh.fx(i) = Real_t(0.0) ;
-     mesh.fy(i) = Real_t(0.0) ;
-     mesh.fz(i) = Real_t(0.0) ;
-  }
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems() ;
-
-  /* Calculate Nodal Forces at domain boundaries */
-  /* problem->commSBN->Transfer(CommSBN::forces); */
-
-}
-
-static inline
-void CalcAccelerationForNodes()
-{
-   Index_t numNode = mesh.numNode() ;
-   for (Index_t i = 0; i < numNode; ++i) {
-      mesh.xdd(i) = mesh.fx(i) / mesh.nodalMass(i);
-      mesh.ydd(i) = mesh.fy(i) / mesh.nodalMass(i);
-      mesh.zdd(i) = mesh.fz(i) / mesh.nodalMass(i);
-   }
-}
-
-static inline
-void ApplyAccelerationBoundaryConditionsForNodes()
-{
-  Index_t numNodeBC = (mesh.sizeX()+1)*(mesh.sizeX()+1) ;
-  for(Index_t i=0 ; i<numNodeBC ; ++i)
-     mesh.xdd(mesh.symmX(i)) = Real_t(0.0) ;
-
-  for(Index_t i=0 ; i<numNodeBC ; ++i)
-     mesh.ydd(mesh.symmY(i)) = Real_t(0.0) ;
-
-  for(Index_t i=0 ; i<numNodeBC ; ++i)
-     mesh.zdd(mesh.symmZ(i)) = Real_t(0.0) ;
-}
-
-static inline
-void CalcVelocityForNodes(const Real_t dt, const Real_t u_cut)
-{
-   Index_t numNode = mesh.numNode() ;
-
-   for ( Index_t i = 0 ; i < numNode ; ++i )
-   {
-     Real_t xdtmp, ydtmp, zdtmp ;
-
-     xdtmp = mesh.xd(i) + mesh.xdd(i) * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     mesh.xd(i) = xdtmp ;
-
-     ydtmp = mesh.yd(i) + mesh.ydd(i) * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     mesh.yd(i) = ydtmp ;
-
-     zdtmp = mesh.zd(i) + mesh.zdd(i) * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     mesh.zd(i) = zdtmp ;
-   }
-}
-
-static inline
-void CalcPositionForNodes(const Real_t dt)
-{
-   Index_t numNode = mesh.numNode() ;
-
-   for ( Index_t i = 0 ; i < numNode ; ++i )
-   {
-     mesh.x(i) += mesh.xd(i) * dt ;
-     mesh.y(i) += mesh.yd(i) * dt ;
-     mesh.z(i) += mesh.zd(i) * dt ;
-   }
-}
-
-static inline
-void LagrangeNodal()
-{
-  const Real_t delt = mesh.deltatime() ;
-  Real_t u_cut = mesh.u_cut() ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes();
-
-  CalcAccelerationForNodes();
-
-  ApplyAccelerationBoundaryConditionsForNodes();
-
-  CalcVelocityForNodes( delt, u_cut ) ;
-
-  CalcPositionForNodes( delt );
-
-  return;
-}
-
-static inline
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-static inline
-Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-static inline
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-static inline
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-static inline
-void CalcElemVelocityGrandient( const Real_t* const xvel,
-                                const Real_t* const yvel,
-                                const Real_t* const zvel,
-                                const Real_t b[][8],
-                                const Real_t detJ,
-                                Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-static inline
-void CalcKinematicsForElems( Index_t numElem, Real_t dt )
-{
-  Real_t B[3][8] ; /** shape function derivatives */
-  Real_t D[6] ;
-  Real_t x_local[8] ;
-  Real_t y_local[8] ;
-  Real_t z_local[8] ;
-  Real_t xd_local[8] ;
-  Real_t yd_local[8] ;
-  Real_t zd_local[8] ;
-  Real_t detJ = Real_t(0.0) ;
-
-  // loop over all elements
-  for( Index_t k=0 ; k<numElem ; ++k )
-  {
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_t* const elemToNode = mesh.nodelist(k) ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      x_local[lnode] = mesh.x(gnode);
-      y_local[lnode] = mesh.y(gnode);
-      z_local[lnode] = mesh.z(gnode);
-    }
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / mesh.volo(k) ;
-    mesh.vnew(k) = relativeVolume ;
-    mesh.delv(k) = relativeVolume - mesh.v(k) ;
-
-    // set characteristic length
-    mesh.arealg(k) = CalcElemCharacteristicLength(x_local,
-                                                  y_local,
-                                                  z_local,
-                                                  volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = mesh.xd(gnode);
-      yd_local[lnode] = mesh.yd(gnode);
-      zd_local[lnode] = mesh.zd(gnode);
-    }
-
-    Real_t dt2 = Real_t(0.5) * dt;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local,
-                                          y_local,
-                                          z_local,
-                                          B, &detJ );
-
-    CalcElemVelocityGrandient( xd_local,
-                               yd_local,
-                               zd_local,
-                               B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    mesh.dxx(k) = D[0];
-    mesh.dyy(k) = D[1];
-    mesh.dzz(k) = D[2];
-  }
-}
-
-static inline
-void CalcLagrangeElements(Real_t deltatime)
-{
-   Index_t numElem = mesh.numElem() ;
-   if (numElem > 0) {
-      // set element connectivity array as a single dimension array. It is
-      // assumed that the array will be of length numelems*numnodesperelem.
-
-      CalcKinematicsForElems(numElem, deltatime) ;
-
-      // element loop to do some stuff not included in the elemlib function.
-      for ( Index_t k=0 ; k<numElem ; ++k )
-      {
-        // calc strain rate and apply as constraint (only done in FB element)
-        Real_t vdov = mesh.dxx(k) + mesh.dyy(k) + mesh.dzz(k) ;
-        Real_t vdovthird = vdov/Real_t(3.0) ;
-        
-        // make the rate of deformation tensor deviatoric
-        mesh.vdov(k) = vdov ;
-        mesh.dxx(k) -= vdovthird ;
-        mesh.dyy(k) -= vdovthird ;
-        mesh.dzz(k) -= vdovthird ;
-
-        // See if any volumes are negative, and take appropriate action.
-        if (mesh.vnew(k) <= Real_t(0.0))
-        {
-           exit(VolumeError) ;
-        }
-      }
-   }
-}
-
-static inline
-void CalcMonotonicQGradientsForElems()
-{
-#define SUM4(a,b,c,d) (a + b + c + d)
-   Index_t numElem = mesh.numElem() ;
-   const Real_t ptiny = Real_t(1.e-36) ;
-
-   for (Index_t i = 0 ; i < numElem ; ++i ) {
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      const Index_t *elemToNode = mesh.nodelist(i);
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = mesh.x(n0) ;
-      Real_t x1 = mesh.x(n1) ;
-      Real_t x2 = mesh.x(n2) ;
-      Real_t x3 = mesh.x(n3) ;
-      Real_t x4 = mesh.x(n4) ;
-      Real_t x5 = mesh.x(n5) ;
-      Real_t x6 = mesh.x(n6) ;
-      Real_t x7 = mesh.x(n7) ;
-
-      Real_t y0 = mesh.y(n0) ;
-      Real_t y1 = mesh.y(n1) ;
-      Real_t y2 = mesh.y(n2) ;
-      Real_t y3 = mesh.y(n3) ;
-      Real_t y4 = mesh.y(n4) ;
-      Real_t y5 = mesh.y(n5) ;
-      Real_t y6 = mesh.y(n6) ;
-      Real_t y7 = mesh.y(n7) ;
-
-      Real_t z0 = mesh.z(n0) ;
-      Real_t z1 = mesh.z(n1) ;
-      Real_t z2 = mesh.z(n2) ;
-      Real_t z3 = mesh.z(n3) ;
-      Real_t z4 = mesh.z(n4) ;
-      Real_t z5 = mesh.z(n5) ;
-      Real_t z6 = mesh.z(n6) ;
-      Real_t z7 = mesh.z(n7) ;
-
-      Real_t xv0 = mesh.xd(n0) ;
-      Real_t xv1 = mesh.xd(n1) ;
-      Real_t xv2 = mesh.xd(n2) ;
-      Real_t xv3 = mesh.xd(n3) ;
-      Real_t xv4 = mesh.xd(n4) ;
-      Real_t xv5 = mesh.xd(n5) ;
-      Real_t xv6 = mesh.xd(n6) ;
-      Real_t xv7 = mesh.xd(n7) ;
-
-      Real_t yv0 = mesh.yd(n0) ;
-      Real_t yv1 = mesh.yd(n1) ;
-      Real_t yv2 = mesh.yd(n2) ;
-      Real_t yv3 = mesh.yd(n3) ;
-      Real_t yv4 = mesh.yd(n4) ;
-      Real_t yv5 = mesh.yd(n5) ;
-      Real_t yv6 = mesh.yd(n6) ;
-      Real_t yv7 = mesh.yd(n7) ;
-
-      Real_t zv0 = mesh.zd(n0) ;
-      Real_t zv1 = mesh.zd(n1) ;
-      Real_t zv2 = mesh.zd(n2) ;
-      Real_t zv3 = mesh.zd(n3) ;
-      Real_t zv4 = mesh.zd(n4) ;
-      Real_t zv5 = mesh.zd(n5) ;
-      Real_t zv6 = mesh.zd(n6) ;
-      Real_t zv7 = mesh.zd(n7) ;
-
-      Real_t vol = mesh.volo(i)*mesh.vnew(i) ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;
-      Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;
-      Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;
-      Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;
-      Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;
-      Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;
-      Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      mesh.delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;
-      dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;
-      dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;
-
-      mesh.delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      mesh.delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;
-      dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;
-      dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;
-
-      mesh.delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      mesh.delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;
-      dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;
-      dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;
-
-      mesh.delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;
-   }
-#undef SUM4
-}
-
-static inline
-void CalcMonotonicQRegionForElems(// parameters
-                          Real_t qlc_monoq,
-                          Real_t qqc_monoq,
-                          Real_t monoq_limiter_mult,
-                          Real_t monoq_max_slope,
-                          Real_t ptiny,
-
-                          // the elementset length
-                          Index_t elength )
-{
-   for ( Index_t ielem = 0 ; ielem < elength; ++ielem ) {
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Index_t i = mesh.matElemlist(ielem);
-      Int_t bcMask = mesh.elemBC(i) ;
-      Real_t delvm, delvp ;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / ( mesh.delv_xi(i) + ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case 0:         delvm = mesh.delv_xi(mesh.lxim(i)) ; break ;
-         case XI_M_SYMM: delvm = mesh.delv_xi(i) ;            break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;                break ;
-         default:        /* ERROR */ ;                        break ;
-      }
-      switch (bcMask & XI_P) {
-         case 0:         delvp = mesh.delv_xi(mesh.lxip(i)) ; break ;
-         case XI_P_SYMM: delvp = mesh.delv_xi(i) ;            break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;                break ;
-         default:        /* ERROR */ ;                        break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( mesh.delv_eta(i) + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case 0:          delvm = mesh.delv_eta(mesh.letam(i)) ; break ;
-         case ETA_M_SYMM: delvm = mesh.delv_eta(i) ;             break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;                  break ;
-         default:         /* ERROR */ ;                          break ;
-      }
-      switch (bcMask & ETA_P) {
-         case 0:          delvp = mesh.delv_eta(mesh.letap(i)) ; break ;
-         case ETA_P_SYMM: delvp = mesh.delv_eta(i) ;             break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;                  break ;
-         default:         /* ERROR */ ;                          break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( mesh.delv_zeta(i) + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case 0:           delvm = mesh.delv_zeta(mesh.lzetam(i)) ; break ;
-         case ZETA_M_SYMM: delvm = mesh.delv_zeta(i) ;              break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;                    break ;
-         default:          /* ERROR */ ;                            break ;
-      }
-      switch (bcMask & ZETA_P) {
-         case 0:           delvp = mesh.delv_zeta(mesh.lzetap(i)) ; break ;
-         case ZETA_P_SYMM: delvp = mesh.delv_zeta(i) ;              break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;                    break ;
-         default:          /* ERROR */ ;                            break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( mesh.vdov(i) > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = mesh.delv_xi(i)   * mesh.delx_xi(i)   ;
-         Real_t delvxeta  = mesh.delv_eta(i)  * mesh.delx_eta(i)  ;
-         Real_t delvxzeta = mesh.delv_zeta(i) * mesh.delx_zeta(i) ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = mesh.elemMass(i) / (mesh.volo(i) * mesh.vnew(i)) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      mesh.qq(i) = qquad ;
-      mesh.ql(i) = qlin  ;
-   }
-}
-
-static inline
-void CalcMonotonicQForElems()
-{  
-   //
-   // initialize parameters
-   // 
-   const Real_t ptiny        = Real_t(1.e-36) ;
-   Real_t monoq_max_slope    = mesh.monoq_max_slope() ;
-   Real_t monoq_limiter_mult = mesh.monoq_limiter_mult() ;
-
-   //
-   // calculate the monotonic q for pure regions
-   //
-   Index_t elength = mesh.numElem() ;
-   if (elength > 0) {
-      Real_t qlc_monoq = mesh.qlc_monoq();
-      Real_t qqc_monoq = mesh.qqc_monoq();
-      CalcMonotonicQRegionForElems(// parameters
-                           qlc_monoq,
-                           qqc_monoq,
-                           monoq_limiter_mult,
-                           monoq_max_slope,
-                           ptiny,
-
-                           // the elemset length
-                           elength );
-   }
-}
-
-static inline
-void CalcQForElems()
-{
-   Real_t qstop = mesh.qstop() ;
-   Index_t numElem = mesh.numElem() ;
-
-   //
-   // MONOTONIC Q option
-   //
-
-   /* Calculate velocity gradients */
-   CalcMonotonicQGradientsForElems() ;
-
-   /* Transfer veloctiy gradients in the first order elements */
-   /* problem->commElements->Transfer(CommElements::monoQ) ; */
-   CalcMonotonicQForElems() ;
-
-   /* Don't allow excessive artificial viscosity */
-   if (numElem != 0) {
-      Index_t idx = -1; 
-      for (Index_t i=0; i<numElem; ++i) {
-         if ( mesh.q(i) > qstop ) {
-            idx = i ;
-            break ;
-         }
-      }
-
-      if(idx >= 0) {
-         exit(QStopError) ;
-      }
-   }
-}
-
-static inline
-void CalcPressureForElems(Real_t* p_new, Real_t* bvc,
-                          Real_t* pbvc, Real_t* e_old,
-                          Real_t* compression, Real_t *vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          Index_t length)
-{
-   Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
-   for (Index_t i = 0; i < length ; ++i) {
-      bvc[i] = c1s * (compression[i] + Real_t(1.));
-      pbvc[i] = c1s;
-   }
-
-   for (Index_t i = 0 ; i < length ; ++i){
-      p_new[i] = bvc[i] * e_old[i] ;
-
-      if    (FABS(p_new[i]) <  p_cut   )
-         p_new[i] = Real_t(0.0) ;
-
-      if    ( vnewc[i] >= eosvmax ) /* impossible condition here? */
-         p_new[i] = Real_t(0.0) ;
-
-      if    (p_new[i]       <  pmin)
-         p_new[i]   = pmin ;
-   }
-}
-
-static inline
-void CalcEnergyForElems(Real_t* p_new, Real_t* e_new, Real_t* q_new,
-                        Real_t* bvc, Real_t* pbvc,
-                        Real_t* p_old, Real_t* e_old, Real_t* q_old,
-                        Real_t* compression, Real_t* compHalfStep,
-                        Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,
-                        Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
-                        Real_t* qq, Real_t* ql,
-                        Real_t rho0,
-                        Real_t eosvmax,
-                        Index_t length)
-{
-   const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-   Real_t *pHalfStep = Allocate<Real_t>(length) ;
-
-   for (Index_t i = 0 ; i < length ; ++i) {
-      e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
-         + Real_t(0.5) * work[i];
-
-      if (e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   }
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                   pmin, p_cut, eosvmax, length);
-
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
-
-      if ( delvc[i] > Real_t(0.) ) {
-         q_new[i] /* = qq[i] = ql[i] */ = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql[i] + qq[i]) ;
-      }
-
-      e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]
-         * (  Real_t(3.0)*(p_old[i]     + q_old[i])
-              - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;
-   }
-
-   for (Index_t i = 0 ; i < length ; ++i) {
-
-      e_new[i] += Real_t(0.5) * work[i];
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   }
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, length);
-
-   for (Index_t i = 0 ; i < length ; ++i){
-      Real_t q_tilde ;
-
-      if (delvc[i] > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*ql[i] + qq[i]) ;
-      }
-
-      e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
-                               - Real_t(8.0)*(pHalfStep[i] + q_new[i])
-                               + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   }
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, length);
-
-   for (Index_t i = 0 ; i < length ; ++i){
-
-      if ( delvc[i] <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql[i] + qq[i]) ;
-
-         if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
-      }
-   }
-
-   Release(&pHalfStep) ;
-
-   return ;
-}
-
-static inline
-void CalcSoundSpeedForElems(Real_t *vnewc, Real_t rho0, Real_t *enewc,
-                            Real_t *pnewc, Real_t *pbvc,
-                            Real_t *bvc, Real_t ss4o3, Index_t nz)
-{
-   for (Index_t i = 0; i < nz ; ++i) {
-      Index_t iz = mesh.matElemlist(i);
-      Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[i] * vnewc[i] *
-                 bvc[i] * pnewc[i]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp) ;
-      }
-      mesh.ss(iz) = ssTmp;
-   }
-}
-
-static inline
-void EvalEOSForElems(Real_t *vnewc, Index_t length)
-{
-   Real_t  e_cut = mesh.e_cut();
-   Real_t  p_cut = mesh.p_cut();
-   Real_t  ss4o3 = mesh.ss4o3();
-   Real_t  q_cut = mesh.q_cut();
-
-   Real_t eosvmax = mesh.eosvmax() ;
-   Real_t eosvmin = mesh.eosvmin() ;
-   Real_t pmin    = mesh.pmin() ;
-   Real_t emin    = mesh.emin() ;
-   Real_t rho0    = mesh.refdens() ;
-
-   Real_t *e_old = Allocate<Real_t>(length) ;
-   Real_t *delvc = Allocate<Real_t>(length) ;
-   Real_t *p_old = Allocate<Real_t>(length) ;
-   Real_t *q_old = Allocate<Real_t>(length) ;
-   Real_t *compression = Allocate<Real_t>(length) ;
-   Real_t *compHalfStep = Allocate<Real_t>(length) ;
-   Real_t *qq = Allocate<Real_t>(length) ;
-   Real_t *ql = Allocate<Real_t>(length) ;
-   Real_t *work = Allocate<Real_t>(length) ;
-   Real_t *p_new = Allocate<Real_t>(length) ;
-   Real_t *e_new = Allocate<Real_t>(length) ;
-   Real_t *q_new = Allocate<Real_t>(length) ;
-   Real_t *bvc = Allocate<Real_t>(length) ;
-   Real_t *pbvc = Allocate<Real_t>(length) ;
-
-   /* compress data, minimal set */
-   for (Index_t i=0; i<length; ++i) {
-      Index_t zidx = mesh.matElemlist(i) ;
-      e_old[i] = mesh.e(zidx) ;
-   }
-
-   for (Index_t i=0; i<length; ++i) {
-      Index_t zidx = mesh.matElemlist(i) ;
-      delvc[i] = mesh.delv(zidx) ;
-   }
-
-   for (Index_t i=0; i<length; ++i) {
-      Index_t zidx = mesh.matElemlist(i) ;
-      p_old[i] = mesh.p(zidx) ;
-   }
-
-   for (Index_t i=0; i<length; ++i) {
-      Index_t zidx = mesh.matElemlist(i) ;
-      q_old[i] = mesh.q(zidx) ;
-   }
-
-   for (Index_t i = 0; i < length ; ++i) {
-      Real_t vchalf ;
-      compression[i] = Real_t(1.) / vnewc[i] - Real_t(1.);
-      vchalf = vnewc[i] - delvc[i] * Real_t(.5);
-      compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);
-   }
-
-   /* Check for v > eosvmax or v < eosvmin */
-   if ( eosvmin != Real_t(0.) ) {
-      for(Index_t i=0 ; i<length ; ++i) {
-         if (vnewc[i] <= eosvmin) { /* impossible due to calling func? */
-            compHalfStep[i] = compression[i] ;
-         }
-      }
-   }
-   if ( eosvmax != Real_t(0.) ) {
-      for(Index_t i=0 ; i<length ; ++i) {
-         if (vnewc[i] >= eosvmax) { /* impossible due to calling func? */
-            p_old[i]        = Real_t(0.) ;
-            compression[i]  = Real_t(0.) ;
-            compHalfStep[i] = Real_t(0.) ;
-         }
-      }
-   }
-
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Index_t zidx = mesh.matElemlist(i) ;
-      qq[i] = mesh.qq(zidx) ;
-      ql[i] = mesh.ql(zidx) ;
-      work[i] = Real_t(0.) ; 
-   }
-
-   CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
-                 p_old, e_old,  q_old, compression, compHalfStep,
-                 vnewc, work,  delvc, pmin,
-                 p_cut, e_cut, q_cut, emin,
-                 qq, ql, rho0, eosvmax, length);
-
-
-   for (Index_t i=0; i<length; ++i) {
-      Index_t zidx = mesh.matElemlist(i) ;
-      mesh.p(zidx) = p_new[i] ;
-   }
-
-   for (Index_t i=0; i<length; ++i) {
-      Index_t zidx = mesh.matElemlist(i) ;
-      mesh.e(zidx) = e_new[i] ;
-   }
-
-   for (Index_t i=0; i<length; ++i) {
-      Index_t zidx = mesh.matElemlist(i) ;
-      mesh.q(zidx) = q_new[i] ;
-   }
-
-   CalcSoundSpeedForElems(vnewc, rho0, e_new, p_new,
-             pbvc, bvc, ss4o3, length) ;
-
-   Release(&pbvc) ;
-   Release(&bvc) ;
-   Release(&q_new) ;
-   Release(&e_new) ;
-   Release(&p_new) ;
-   Release(&work) ;
-   Release(&ql) ;
-   Release(&qq) ;
-   Release(&compHalfStep) ;
-   Release(&compression) ;
-   Release(&q_old) ;
-   Release(&p_old) ;
-   Release(&delvc) ;
-   Release(&e_old) ;
-}
-
-static inline
-void ApplyMaterialPropertiesForElems()
-{
-  Index_t length = mesh.numElem() ;
-
-  if (length != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = mesh.eosvmin() ;
-    Real_t eosvmax = mesh.eosvmax() ;
-    Real_t *vnewc = Allocate<Real_t>(length) ;
-
-    for (Index_t i=0 ; i<length ; ++i) {
-       Index_t zn = mesh.matElemlist(i) ;
-       vnewc[i] = mesh.vnew(zn) ;
-    }
-
-    if (eosvmin != Real_t(0.)) {
-       for(Index_t i=0 ; i<length ; ++i) {
-          if (vnewc[i] < eosvmin)
-             vnewc[i] = eosvmin ;
-       }
-    }
-
-    if (eosvmax != Real_t(0.)) {
-       for(Index_t i=0 ; i<length ; ++i) {
-          if (vnewc[i] > eosvmax)
-             vnewc[i] = eosvmax ;
-       }
-    }
-
-    for (Index_t i=0; i<length; ++i) {
-       Index_t zn = mesh.matElemlist(i) ;
-       Real_t vc = mesh.v(zn) ;
-       if (eosvmin != Real_t(0.)) {
-          if (vc < eosvmin)
-             vc = eosvmin ;
-       }
-       if (eosvmax != Real_t(0.)) {
-          if (vc > eosvmax)
-             vc = eosvmax ;
-       }
-       if (vc <= 0.) {
-          exit(VolumeError) ;
-       }
-    }
-
-    EvalEOSForElems(vnewc, length);
-
-    Release(&vnewc) ;
-
-  }
-}
-
-static inline
-void UpdateVolumesForElems()
-{
-   Index_t numElem = mesh.numElem();
-   if (numElem != 0) {
-      Real_t v_cut = mesh.v_cut();
-
-      for(Index_t i=0 ; i<numElem ; ++i) {
-         Real_t tmpV ;
-         tmpV = mesh.vnew(i) ;
-
-         if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-            tmpV = Real_t(1.0) ;
-         mesh.v(i) = tmpV ;
-      }
-   }
-
-   return ;
-}
-
-static inline
-void LagrangeElements()
-{
-  const Real_t deltatime = mesh.deltatime() ;
-
-  CalcLagrangeElements(deltatime) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems() ;
-
-  ApplyMaterialPropertiesForElems() ;
-
-  UpdateVolumesForElems() ;
-}
-
-static inline
-void CalcCourantConstraintForElems()
-{
-   Real_t dtcourant = Real_t(1.0e+20) ;
-   Index_t   courant_elem = -1 ;
-   Real_t      qqc = mesh.qqc() ;
-   Index_t length = mesh.numElem() ;
-
-   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
-
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Index_t indx = mesh.matElemlist(i) ;
-
-      Real_t dtf = mesh.ss(indx) * mesh.ss(indx) ;
-
-      if ( mesh.vdov(indx) < Real_t(0.) ) {
-
-         dtf = dtf
-            + qqc2 * mesh.arealg(indx) * mesh.arealg(indx)
-            * mesh.vdov(indx) * mesh.vdov(indx) ;
-      }
-
-      dtf = SQRT(dtf) ;
-
-      dtf = mesh.arealg(indx) / dtf ;
-
-   /* determine minimum timestep with its corresponding elem */
-      if (mesh.vdov(indx) != Real_t(0.)) {
-         if ( dtf < dtcourant ) {
-            dtcourant = dtf ;
-            courant_elem = indx ;
-         }
-      }
-   }
-
-   /* Don't try to register a time constraint if none of the elements
-    * were active */
-   if (courant_elem != -1) {
-      mesh.dtcourant() = dtcourant ;
-   }
-
-   return ;
-}
-
-static inline
-void CalcHydroConstraintForElems()
-{
-   Real_t dthydro = Real_t(1.0e+20) ;
-   Index_t hydro_elem = -1 ;
-   Real_t dvovmax = mesh.dvovmax() ;
-   Index_t length = mesh.numElem() ;
-
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Index_t indx = mesh.matElemlist(i) ;
-
-      if (mesh.vdov(indx) != Real_t(0.)) {
-         Real_t dtdvov = dvovmax / (FABS(mesh.vdov(indx))+Real_t(1.e-20)) ;
-         if ( dthydro > dtdvov ) {
-            dthydro = dtdvov ;
-            hydro_elem = indx ;
-         }
-      }
-   }
-
-   if (hydro_elem != -1) {
-      mesh.dthydro() = dthydro ;
-   }
-
-   return ;
-}
-
-static inline
-void CalcTimeConstraintsForElems() {
-   /* evaluate time constraint */
-   CalcCourantConstraintForElems() ;
-
-   /* check hydro constraint */
-   CalcHydroConstraintForElems() ;
-}
-
-static inline
-void LagrangeLeapFrog()
-{
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal();
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements();
-
-   CalcTimeConstraintsForElems();
-
-   // LagrangeRelease() ;  Creation/destruction of temps may be important to capture 
-}
-
-int main(int argc, char *argv[])
-{
-   RAJA::Timer timer_main;
-   RAJA::Timer timer_cycle;
-
-   timer_main.start("timer_main");
-
-
-   Index_t edgeElems = 45 ;
-   Index_t edgeNodes = edgeElems+1 ;
-   // Real_t ds = Real_t(1.125)/Real_t(edgeElems) ; /* may accumulate roundoff */
-   Real_t tx, ty, tz ;
-   Index_t nidx, zidx ;
-   Index_t meshElems, meshNodes ;
-
-   /* get run options to measure various metrics */
-
-   /* ... */
-
-   /****************************/
-   /*   Initialize Sedov Mesh  */
-   /****************************/
-
-   /* construct a uniform box for this processor */
-
-   mesh.sizeX()   = edgeElems ;
-   mesh.sizeY()   = edgeElems ;
-   mesh.sizeZ()   = edgeElems ;
-   mesh.numElem() = edgeElems*edgeElems*edgeElems ;
-   mesh.numNode() = edgeNodes*edgeNodes*edgeNodes ;
-
-   meshElems = mesh.numElem() ;
-   meshNodes = mesh.numNode() ;
-
-   /* allocate field memory */
-
-   mesh.AllocateElemPersistent(mesh.numElem()) ;
-   mesh.AllocateElemTemporary (mesh.numElem()) ;
-
-   mesh.AllocateNodalPersistent(mesh.numNode()) ;
-   mesh.AllocateNodesets(edgeNodes*edgeNodes) ;
-
-   /* Basic Field Initialization */
-
-   for (Index_t i=0; i<meshElems; ++i) {
-      mesh.e(i) = Real_t(0.0) ;
-      mesh.p(i) = Real_t(0.0) ;
-      mesh.q(i) = Real_t(0.0) ;
-      mesh.v(i) = Real_t(1.0) ;
-   }
-
-   for (Index_t i=0; i<meshNodes; ++i) {
-      mesh.xd(i) = Real_t(0.0) ;
-      mesh.yd(i) = Real_t(0.0) ;
-      mesh.zd(i) = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<meshNodes; ++i) {
-      mesh.xdd(i) = Real_t(0.0) ;
-      mesh.ydd(i) = Real_t(0.0) ;
-      mesh.zdd(i) = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<meshNodes; ++i) {
-      mesh.nodalMass(i) = Real_t(0.0) ;
-   }
-
-   /* initialize nodal coordinates */
-
-   nidx = 0 ;
-   tz  = Real_t(0.) ;
-   for (Index_t plane=0; plane<edgeNodes; ++plane) {
-      ty = Real_t(0.) ;
-      for (Index_t row=0; row<edgeNodes; ++row) {
-         tx = Real_t(0.) ;
-         for (Index_t col=0; col<edgeNodes; ++col) {
-            mesh.x(nidx) = tx ;
-            mesh.y(nidx) = ty ;
-            mesh.z(nidx) = tz ;
-            ++nidx ;
-            // tx += ds ; /* may accumulate roundoff... */
-            tx = Real_t(1.125)*Real_t(col+1)/Real_t(edgeElems) ;
-         }
-         // ty += ds ;  /* may accumulate roundoff... */
-         ty = Real_t(1.125)*Real_t(row+1)/Real_t(edgeElems) ;
-      }
-      // tz += ds ;  /* may accumulate roundoff... */
-      tz = Real_t(1.125)*Real_t(plane+1)/Real_t(edgeElems) ;
-   }
-
-
-   /* embed hexehedral elements in nodal point lattice */
-
-   nidx = 0 ;
-   zidx = 0 ;
-   for (Index_t plane=0; plane<edgeElems; ++plane) {
-      for (Index_t row=0; row<edgeElems; ++row) {
-         for (Index_t col=0; col<edgeElems; ++col) {
-            Index_t *localNode = mesh.nodelist(zidx) ;
-            localNode[0] = nidx                                       ;
-            localNode[1] = nidx                                   + 1 ;
-            localNode[2] = nidx                       + edgeNodes + 1 ;
-            localNode[3] = nidx                       + edgeNodes     ;
-            localNode[4] = nidx + edgeNodes*edgeNodes                 ;
-            localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
-            localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
-            localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
-            ++zidx ;
-            ++nidx ;
-         }
-         ++nidx ;
-      }
-      nidx += edgeNodes ;
-   }
-
-   /* Create a material IndexSet (entire mesh same material for now) */
-   for (Index_t i=0; i<meshElems; ++i) {
-      mesh.matElemlist(i) = i ;
-   }
-   
-   /* initialize material parameters */
-   mesh.dtfixed() = Real_t(-1.0e-7) ;
-   mesh.deltatime() = Real_t(1.0e-7) ;
-   mesh.deltatimemultlb() = Real_t(1.1) ;
-   mesh.deltatimemultub() = Real_t(1.2) ;
-   mesh.stoptime()  = Real_t(1.0e-2) ;
-   mesh.dtcourant() = Real_t(1.0e+20) ;
-   mesh.dthydro()   = Real_t(1.0e+20) ;
-   mesh.dtmax()     = Real_t(1.0e-2) ;
-   mesh.time()    = Real_t(0.) ;
-   mesh.cycle()   = 0 ;
-
-   mesh.e_cut() = Real_t(1.0e-7) ;
-   mesh.p_cut() = Real_t(1.0e-7) ;
-   mesh.q_cut() = Real_t(1.0e-7) ;
-   mesh.u_cut() = Real_t(1.0e-7) ;
-   mesh.v_cut() = Real_t(1.0e-10) ;
-
-   mesh.hgcoef()      = Real_t(3.0) ;
-   mesh.ss4o3()       = Real_t(4.0)/Real_t(3.0) ;
-
-   mesh.qstop()              =  Real_t(1.0e+12) ;
-   mesh.monoq_max_slope()    =  Real_t(1.0) ;
-   mesh.monoq_limiter_mult() =  Real_t(2.0) ;
-   mesh.qlc_monoq()          = Real_t(0.5) ;
-   mesh.qqc_monoq()          = Real_t(2.0)/Real_t(3.0) ;
-   mesh.qqc()                = Real_t(2.0) ;
-
-   mesh.pmin() =  Real_t(0.) ;
-   mesh.emin() = Real_t(-1.0e+15) ;
-
-   mesh.dvovmax() =  Real_t(0.1) ;
-
-   mesh.eosvmax() =  Real_t(1.0e+9) ;
-   mesh.eosvmin() =  Real_t(1.0e-9) ;
-
-   mesh.refdens() =  Real_t(1.0) ;
-
-   /* initialize field data */
-   for (Index_t i=0; i<meshElems; ++i) {
-      Real_t x_local[8], y_local[8], z_local[8] ;
-      Index_t *elemToNode = mesh.nodelist(i) ;
-      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-      {
-        Index_t gnode = elemToNode[lnode];
-        x_local[lnode] = mesh.x(gnode);
-        y_local[lnode] = mesh.y(gnode);
-        z_local[lnode] = mesh.z(gnode);
-      }
-
-      // volume calculations
-      Real_t volume = CalcElemVolume(x_local, y_local, z_local );
-      mesh.volo(i) = volume ;
-      mesh.elemMass(i) = volume ;
-      for (Index_t j=0; j<8; ++j) {
-         Index_t idx = elemToNode[j] ;
-         mesh.nodalMass(idx) += volume / Real_t(8.0) ;
-      }
-   }
-
-   /* deposit energy */
-   mesh.e(0) = Real_t(3.948746e+7) ;
-
-   /* set up symmetry nodesets */
-   nidx = 0 ;
-   for (Index_t i=0; i<edgeNodes; ++i) {
-      Index_t planeInc = i*edgeNodes*edgeNodes ;
-      Index_t rowInc   = i*edgeNodes ;
-      for (Index_t j=0; j<edgeNodes; ++j) {
-         mesh.symmX(nidx) = planeInc + j*edgeNodes ;
-         mesh.symmY(nidx) = planeInc + j ;
-         mesh.symmZ(nidx) = rowInc   + j ;
-         ++nidx ;
-      }
-   }
-
-   /* set up elemement connectivity information */
-   mesh.lxim(0) = 0 ;
-   for (Index_t i=1; i<meshElems; ++i) {
-      mesh.lxim(i)   = i-1 ;
-      mesh.lxip(i-1) = i ;
-   }
-   mesh.lxip(meshElems-1) = meshElems-1 ;
-
-   for (Index_t i=0; i<edgeElems; ++i) {
-      mesh.letam(i) = i ; 
-      mesh.letap(meshElems-edgeElems+i) = meshElems-edgeElems+i ;
-   }
-   for (Index_t i=edgeElems; i<meshElems; ++i) {
-      mesh.letam(i) = i-edgeElems ;
-      mesh.letap(i-edgeElems) = i ;
-   }
-
-   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
-      mesh.lzetam(i) = i ;
-      mesh.lzetap(meshElems-edgeElems*edgeElems+i) = meshElems-edgeElems*edgeElems+i ;
-   }
-   for (Index_t i=edgeElems*edgeElems; i<meshElems; ++i) {
-      mesh.lzetam(i) = i - edgeElems*edgeElems ;
-      mesh.lzetap(i-edgeElems*edgeElems) = i ;
-   }
-
-   /* set up boundary condition information */
-   for (Index_t i=0; i<meshElems; ++i) {
-      mesh.elemBC(i) = 0 ;  /* clear BCs by default */
-   }
-
-   /* faces on "external" boundaries will be */
-   /* symmetry plane or free surface BCs */
-   for (Index_t i=0; i<edgeElems; ++i) {
-      Index_t planeInc = i*edgeElems*edgeElems ;
-      Index_t rowInc   = i*edgeElems ;
-      for (Index_t j=0; j<edgeElems; ++j) {
-         mesh.elemBC(planeInc+j*edgeElems) |= XI_M_SYMM ;
-         mesh.elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_FREE ;
-         mesh.elemBC(planeInc+j) |= ETA_M_SYMM ;
-         mesh.elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= ETA_P_FREE ;
-         mesh.elemBC(rowInc+j) |= ZETA_M_SYMM ;
-         mesh.elemBC(rowInc+j+meshElems-edgeElems*edgeElems) |= ZETA_P_FREE ;
-      }
-   }
-
-   /* timestep to solution */
-   timer_cycle.start("timer_cycle");
-   int its=0;
-   while(mesh.time() < mesh.stoptime() ) {
-      TimeIncrement() ;
-      LagrangeLeapFrog() ;
-      its++;
-      /* problem->commNodes->Transfer(CommNodes::syncposvel) ; */
-#if LULESH_SHOW_PROGRESS
-      printf("time = %e, dt=%e\n",
-             double(mesh.time()), double(mesh.deltatime()) ) ;
-#endif
-   }
-   timer_cycle.stop("timer_cycle");
-
-   timer_main.stop("timer_main");
-
-   printf("iterations: %d\n",its);
-   printf("Total Cycle Time (sec) = %Lf\n", timer_cycle.elapsed() );
-   printf("Total main Time (sec) = %Lf\n", timer_main.elapsed() );
-
-   //   FILE *fp = fopen("x.asc","wb");
-   //for (Index_t i=0; i<mesh.numElem(); i++)
-   //    fprintf(fp,"%.6f\n",mesh.x(i));
-   //fclose(fp);
-               
-   return 0 ;
-}
-
diff --git a/test/LULESH-v1.0/LULESH-v1.0_baseline/luleshOMP.cc b/test/LULESH-v1.0/LULESH-v1.0_baseline/luleshOMP.cc
deleted file mode 100644
index b8d104f40..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_baseline/luleshOMP.cc
+++ /dev/null
@@ -1,3190 +0,0 @@
-/*
-
-                 Copyright (c) 2010.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 1.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <vector>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-
-#include "Timer.hxx"
-
-int show_run_progress = 0 ;
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-/****************************************************/
-/* Allow flexibility for arithmetic representations */
-/****************************************************/
-
-/* Could also support fixed point and interval arithmetic types */
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  /* 10 bytes on x86 */
-
-typedef int    Index_t ; /* array subscript and loop index */
-typedef real8  Real_t ;  /* floating point representation */
-typedef int    Int_t ;   /* integer representation */
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-/************************************************************/
-/* Allow for flexible data layout experiments by separating */
-/* array interface from underlying implementation.          */
-/************************************************************/
-
-struct Domain {
-
-/* This first implementation allows for runnable code */
-/* and is not meant to be optimal. Final implementation */
-/* should separate declaration and allocation phases */
-/* so that allocation can be scheduled in a cache conscious */
-/* manner. */
-
-public:
-
-   /**************/
-   /* Allocation */
-   /**************/
-
-   void AllocateNodalPersistent(size_t size)
-   {
-      m_x.reserve(size) ;
-      m_y.reserve(size) ;
-      m_z.reserve(size) ;
-
-      m_xd.reserve(size) ;
-      m_yd.reserve(size) ;
-      m_zd.reserve(size) ;
-
-      m_xdd.reserve(size) ;
-      m_ydd.reserve(size) ;
-      m_zdd.reserve(size) ;
-
-      m_fx.reserve(size) ;
-      m_fy.reserve(size) ;
-      m_fz.reserve(size) ;
-
-      m_nodalMass.reserve(size) ;
-   }
-
-   void AllocateElemPersistent(size_t size)
-   {
-      m_matElemlist.reserve(size) ;
-      m_nodelist.reserve(8*size) ;
-
-      m_lxim.reserve(size) ;
-      m_lxip.reserve(size) ;
-      m_letam.reserve(size) ;
-      m_letap.reserve(size) ;
-      m_lzetam.reserve(size) ;
-      m_lzetap.reserve(size) ;
-
-      m_elemBC.reserve(size) ;
-
-      m_e.reserve(size) ;
-
-      m_p.reserve(size) ;
-      m_q.reserve(size) ;
-      m_ql.reserve(size) ;
-      m_qq.reserve(size) ;
-
-      m_v.reserve(size) ;
-      m_volo.reserve(size) ;
-      m_delv.reserve(size) ;
-      m_vdov.reserve(size) ;
-
-      m_arealg.reserve(size) ;
-   
-      m_ss.reserve(size) ;
-
-      m_elemMass.reserve(size) ;
-   }
-
-   /* Temporaries should not be initialized in bulk but */
-   /* this is a runnable placeholder for now */
-   void AllocateElemTemporary(size_t size)
-   {
-      m_dxx.reserve(size) ;
-      m_dyy.reserve(size) ;
-      m_dzz.reserve(size) ;
-
-      m_delv_xi.reserve(size) ;
-      m_delv_eta.reserve(size) ;
-      m_delv_zeta.reserve(size) ;
-
-      m_delx_xi.reserve(size) ;
-      m_delx_eta.reserve(size) ;
-      m_delx_zeta.reserve(size) ;
-
-      m_vnew.reserve(size) ;
-   }
-
-   void AllocateNodesets(size_t size)
-   {
-      m_symmX.reserve(size) ;
-      m_symmY.reserve(size) ;
-      m_symmZ.reserve(size) ;
-   }
-
-   void AllocateNodeElemIndexes()
-   {
-       Index_t m;
-       Index_t numElem = this->numElem() ;
-       Index_t numNode = this->numNode() ;
-
-       /* set up node-centered indexing of elements */
-       m_nodeElemCount.reserve(numNode);
-
-       for (Index_t i=0;i<numNode;++i) {
-          nodeElemCount(i)=0;
-       }
-
-       for (Index_t i=0; i<numElem; ++i) {
-          Index_t *nl = nodelist(i) ;
-          for (Index_t j=0; j < 8; ++j) {
-             ++nodeElemCount(nl[j]);
-          }
-       }
-
-       m_nodeElemStart.reserve(numNode);
-
-       nodeElemStart(0)=0;
-
-       for (Index_t i=1; i < numNode; ++i) {
-          nodeElemStart(i) = nodeElemStart(i-1) + nodeElemCount(i-1) ;
-       }
-
-//       m_nodeElemList.reserve(nodeElemStart(numNode-1) +
-//                             nodeElemCount(numNode-1));
-
-       m_nodeElemCornerList.reserve(nodeElemStart(numNode-1) +
-                                   nodeElemCount(numNode-1));
-
-       for (Index_t i=0; i < numNode; ++i) {
-          nodeElemCount(i)=0;
-       }
-
-       for (Index_t i=0; i < numElem; ++i) {
-          Index_t *nl = nodelist(i) ;
-          for (Index_t j=0; j < 8; ++j) {
-             Index_t m = nl[j];
-             Index_t k = i*8 + j ;
-             Index_t offset = nodeElemStart(m)+nodeElemCount(m) ;
-//             nodeElemList(offset) = i;
-             nodeElemCornerList(offset) = k;
-             ++nodeElemCount(m);
-          }
-       }
-
-       Index_t clSize = m_nodeElemCornerList.size() ;
-       for (Index_t i=0; i < clSize; ++i) {
-          Index_t clv = nodeElemCornerList(i) ;
-          if ((clv < 0) || (clv > numElem*8)) {
-               fprintf(stderr,
-        "AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n");
-               exit(1);
-          }
-      }
-   }
-
-   
-   /**********/
-   /* Access */
-   /**********/
-
-   /* Node-centered */
-
-   Real_t& x(Index_t idx)    { return m_x[idx] ; }
-   Real_t& y(Index_t idx)    { return m_y[idx] ; }
-   Real_t& z(Index_t idx)    { return m_z[idx] ; }
-
-   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
-   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
-   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
-
-   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
-   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
-   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
-
-   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
-   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
-   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
-
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   Index_t& symmX(Index_t idx) { return m_symmX[idx] ; }
-   Index_t& symmY(Index_t idx) { return m_symmY[idx] ; }
-   Index_t& symmZ(Index_t idx) { return m_symmZ[idx] ; }
-
-   Index_t& nodeElemCount(Index_t idx) { return m_nodeElemCount[idx] ; }
-   Index_t& nodeElemStart(Index_t idx) { return m_nodeElemStart[idx] ; }
-//   Index_t& nodeElemList(Index_t idx)  { return m_nodeElemList[idx] ; }
-   Index_t& nodeElemCornerList(Index_t i) { return m_nodeElemCornerList[i] ; }
-
-   /* Element-centered */
-
-   Index_t&  matElemlist(Index_t idx) { return m_matElemlist[idx] ; }
-   Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
-
-   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
-   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
-   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
-   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
-   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
-   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
-
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   Real_t& p(Index_t idx)          { return m_p[idx] ; }
-   Real_t& q(Index_t idx)          { return m_q[idx] ; }
-   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
-   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
-
-   Real_t& v(Index_t idx)          { return m_v[idx] ; }
-   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
-   Real_t& vnew(Index_t idx)       { return m_vnew[idx] ; }
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-   
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   /* Params */
-
-   Real_t& dtfixed()              { return m_dtfixed ; }
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-
-   Real_t& u_cut()                { return m_u_cut ; }
-   Real_t& hgcoef()               { return m_hgcoef ; }
-   Real_t& qstop()                { return m_qstop ; }
-   Real_t& monoq_max_slope()      { return m_monoq_max_slope ; }
-   Real_t& monoq_limiter_mult()   { return m_monoq_limiter_mult ; }
-   Real_t& e_cut()                { return m_e_cut ; }
-   Real_t& p_cut()                { return m_p_cut ; }
-   Real_t& ss4o3()                { return m_ss4o3 ; }
-   Real_t& q_cut()                { return m_q_cut ; }
-   Real_t& v_cut()                { return m_v_cut ; }
-   Real_t& qlc_monoq()            { return m_qlc_monoq ; }
-   Real_t& qqc_monoq()            { return m_qqc_monoq ; }
-   Real_t& qqc()                  { return m_qqc ; }
-   Real_t& eosvmax()              { return m_eosvmax ; }
-   Real_t& eosvmin()              { return m_eosvmin ; }
-   Real_t& pmin()                 { return m_pmin ; }
-   Real_t& emin()                 { return m_emin ; }
-   Real_t& dvovmax()              { return m_dvovmax ; }
-   Real_t& refdens()              { return m_refdens ; }
-
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-
-private:
-
-   /******************/
-   /* Implementation */
-   /******************/
-
-   /* Node-centered */
-
-   std::vector<Real_t> m_x ;  /* coordinates */
-   std::vector<Real_t> m_y ;
-   std::vector<Real_t> m_z ;
-
-   std::vector<Real_t> m_xd ; /* velocities */
-   std::vector<Real_t> m_yd ;
-   std::vector<Real_t> m_zd ;
-
-   std::vector<Real_t> m_xdd ; /* accelerations */
-   std::vector<Real_t> m_ydd ;
-   std::vector<Real_t> m_zdd ;
-
-   std::vector<Real_t> m_fx ;  /* forces */
-   std::vector<Real_t> m_fy ;
-   std::vector<Real_t> m_fz ;
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
-   std::vector<Index_t> m_symmY ;
-   std::vector<Index_t> m_symmZ ;
-
-   std::vector<Index_t> m_nodeElemCount ;
-   std::vector<Index_t> m_nodeElemStart ;
-//   std::vector<Index_t> m_nodeElemList ;
-   std::vector<Index_t> m_nodeElemCornerList ;
-
-   /* Element-centered */
-
-   std::vector<Index_t>  m_matElemlist ;  /* material indexset */
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */
-   std::vector<Index_t>  m_lxip ;
-   std::vector<Index_t>  m_letam ;
-   std::vector<Index_t>  m_letap ;
-   std::vector<Index_t>  m_lzetam ;
-   std::vector<Index_t>  m_lzetap ;
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   std::vector<Real_t> m_p ;   /* pressure */
-   std::vector<Real_t> m_q ;   /* q */
-   std::vector<Real_t> m_ql ;  /* linear term for q */
-   std::vector<Real_t> m_qq ;  /* quadratic term for q */
-
-   std::vector<Real_t> m_v ;     /* relative volume */
-   std::vector<Real_t> m_volo ;  /* reference volume */
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   /* Parameters */
-
-   Real_t  m_dtfixed ;           /* fixed time increment */
-   Real_t  m_time ;              /* current time */
-   Real_t  m_deltatime ;         /* variable time increment */
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_stoptime ;          /* end time for simulation */
-
-   Real_t  m_u_cut ;             /* velocity tolerance */
-   Real_t  m_hgcoef ;            /* hourglass control */
-   Real_t  m_qstop ;             /* excessive q indicator */
-   Real_t  m_monoq_max_slope ;
-   Real_t  m_monoq_limiter_mult ;
-   Real_t  m_e_cut ;             /* energy tolerance */
-   Real_t  m_p_cut ;             /* pressure tolerance */
-   Real_t  m_ss4o3 ;
-   Real_t  m_q_cut ;             /* q tolerance */
-   Real_t  m_v_cut ;             /* relative volume tolerance */
-   Real_t  m_qlc_monoq ;         /* linear term coef for q */
-   Real_t  m_qqc_monoq ;         /* quadratic term coef for q */
-   Real_t  m_qqc ;
-   Real_t  m_eosvmax ;
-   Real_t  m_eosvmin ;
-   Real_t  m_pmin ;              /* pressure floor */
-   Real_t  m_emin ;              /* energy floor */
-   Real_t  m_dvovmax ;           /* maximum allowable volume change */
-   Real_t  m_refdens ;           /* reference density */
-
-   Real_t  m_dtcourant ;         /* courant constraint */
-   Real_t  m_dthydro ;           /* volume change constraint */
-   Real_t  m_dtmax ;             /* maximum allowable time increment */
-
-   Int_t   m_cycle ;             /* iteration count for simulation */
-
-   Index_t   m_sizeX ;           /* X,Y,Z extent of this block */
-   Index_t   m_sizeY ;
-   Index_t   m_sizeZ ;
-
-   Index_t   m_numElem ;         /* Elements/Nodes in this domain */
-   Index_t   m_numNode ;
-} domain ;
-
-
-template <typename T>
-T *Allocate(size_t size)
-{
-   return static_cast<T *>(malloc(sizeof(T)*size)) ;
-}
-
-template <typename T>
-void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-
-/* Stuff needed for boundary conditions */
-/* 2 BCs on each of 6 hexahedral faces (12 bits) */
-#define XI_M        0x003
-#define XI_M_SYMM   0x001
-#define XI_M_FREE   0x002
-
-#define XI_P        0x00c
-#define XI_P_SYMM   0x004
-#define XI_P_FREE   0x008
-
-#define ETA_M       0x030
-#define ETA_M_SYMM  0x010
-#define ETA_M_FREE  0x020
-
-#define ETA_P       0x0c0
-#define ETA_P_SYMM  0x040
-#define ETA_P_FREE  0x080
-
-#define ZETA_M      0x300
-#define ZETA_M_SYMM 0x100
-#define ZETA_M_FREE 0x200
-
-#define ZETA_P      0xc00
-#define ZETA_P_SYMM 0x400
-#define ZETA_P_FREE 0x800
-
-
-static inline
-void TimeIncrement()
-{
-   Real_t targetdt = domain.stoptime() - domain.time() ;
-
-   if ((domain.dtfixed() <= Real_t(0.0)) && (domain.cycle() != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = domain.deltatime() ;
-
-      /* This will require a reduction in parallel */
-      Real_t newdt = Real_t(1.0e+20) ;
-      if (domain.dtcourant() < newdt) {
-         newdt = domain.dtcourant() / Real_t(2.0) ;
-      }
-      if (domain.dthydro() < newdt) {
-         newdt = domain.dthydro() * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < domain.deltatimemultlb()) {
-            newdt = olddt ;
-         }
-         else if (ratio > domain.deltatimemultub()) {
-            newdt = olddt*domain.deltatimemultub() ;
-         }
-      }
-
-      if (newdt > domain.dtmax()) {
-         newdt = domain.dtmax() ;
-      }
-      domain.deltatime() = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > domain.deltatime()) &&
-       (targetdt < (Real_t(4.0) * domain.deltatime() / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * domain.deltatime() / Real_t(3.0) ;
-   }
-
-   if (targetdt < domain.deltatime()) {
-      domain.deltatime() = targetdt ;
-   }
-
-   domain.time() += domain.deltatime() ;
-
-   ++domain.cycle() ;
-}
-
-static inline
-void InitStressTermsForElems(Index_t numElem, 
-                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-#pragma omp parallel for firstprivate(numElem)
-   for (Index_t i = 0 ; i < numElem ; ++i){
-      sigxx[i] =  sigyy[i] = sigzz[i] =  - domain.p(i) - domain.q(i) ;
-   }
-}
-
-static inline
-void CalcElemShapeFunctionDerivatives( const Real_t* const x,
-                                       const Real_t* const y,
-                                       const Real_t* const z,
-                                       Real_t b[][8],
-                                       Real_t* const volume )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-static inline
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-static inline
-void CalcElemNodeNormals(Real_t pfx[8],
-                         Real_t pfy[8],
-                         Real_t pfz[8],
-                         const Real_t x[8],
-                         const Real_t y[8],
-                         const Real_t z[8])
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-static inline
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_t* const fx,
-                                  Real_t* const fy,
-                                  Real_t* const fz )
-{
-  Real_t pfx0 = B[0][0] ;   Real_t pfx1 = B[0][1] ;
-  Real_t pfx2 = B[0][2] ;   Real_t pfx3 = B[0][3] ;
-  Real_t pfx4 = B[0][4] ;   Real_t pfx5 = B[0][5] ;
-  Real_t pfx6 = B[0][6] ;   Real_t pfx7 = B[0][7] ;
-
-  Real_t pfy0 = B[1][0] ;   Real_t pfy1 = B[1][1] ;
-  Real_t pfy2 = B[1][2] ;   Real_t pfy3 = B[1][3] ;
-  Real_t pfy4 = B[1][4] ;   Real_t pfy5 = B[1][5] ;
-  Real_t pfy6 = B[1][6] ;   Real_t pfy7 = B[1][7] ;
-
-  Real_t pfz0 = B[2][0] ;   Real_t pfz1 = B[2][1] ;
-  Real_t pfz2 = B[2][2] ;   Real_t pfz3 = B[2][3] ;
-  Real_t pfz4 = B[2][4] ;   Real_t pfz5 = B[2][5] ;
-  Real_t pfz6 = B[2][6] ;   Real_t pfz7 = B[2][7] ;
-
-  fx[0] = -( stress_xx * pfx0 );
-  fx[1] = -( stress_xx * pfx1 );
-  fx[2] = -( stress_xx * pfx2 );
-  fx[3] = -( stress_xx * pfx3 );
-  fx[4] = -( stress_xx * pfx4 );
-  fx[5] = -( stress_xx * pfx5 );
-  fx[6] = -( stress_xx * pfx6 );
-  fx[7] = -( stress_xx * pfx7 );
-
-  fy[0] = -( stress_yy * pfy0  );
-  fy[1] = -( stress_yy * pfy1  );
-  fy[2] = -( stress_yy * pfy2  );
-  fy[3] = -( stress_yy * pfy3  );
-  fy[4] = -( stress_yy * pfy4  );
-  fy[5] = -( stress_yy * pfy5  );
-  fy[6] = -( stress_yy * pfy6  );
-  fy[7] = -( stress_yy * pfy7  );
-
-  fz[0] = -( stress_zz * pfz0 );
-  fz[1] = -( stress_zz * pfz1 );
-  fz[2] = -( stress_zz * pfz2 );
-  fz[3] = -( stress_zz * pfz3 );
-  fz[4] = -( stress_zz * pfz4 );
-  fz[5] = -( stress_zz * pfz5 );
-  fz[6] = -( stress_zz * pfz6 );
-  fz[7] = -( stress_zz * pfz7 );
-}
-
-static inline
-void IntegrateStressForElems( Index_t numElem,
-                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                              Real_t *determ)
-{
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *fx_elem = Allocate<Real_t>(numElem8) ;
-   Real_t *fy_elem = Allocate<Real_t>(numElem8) ;
-   Real_t *fz_elem = Allocate<Real_t>(numElem8) ;
-
-  // loop over all elements
-#pragma omp parallel for firstprivate(numElem)
-  for( Index_t k=0 ; k<numElem ; ++k )
-  {
-    Real_t B[3][8] ;// shape function derivatives
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-
-    const Index_t* const elemNodes = domain.nodelist(k);
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      x_local[lnode] = domain.x(gnode);
-      y_local[lnode] = domain.y(gnode);
-      z_local[lnode] = domain.z(gnode);
-    }
-
-    /* Volume calculation involves extra work for numerical consistency. */
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                         B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                          x_local, y_local, z_local );
-
-    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                 &fx_elem[k*8], &fy_elem[k*8], &fz_elem[k*8] ) ;
-
-#if 0
-    // copy nodal force contributions to global force arrray.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      domain.fx(gnode) += fx_local[lnode];
-      domain.fy(gnode) += fy_local[lnode];
-      domain.fz(gnode) += fz_local[lnode];
-    }
-#endif
-  }
-
-  {
-     Index_t numNode = domain.numNode() ;
-
-#pragma omp parallel for firstprivate(numNode)
-     for( Index_t gnode=0 ; gnode<numNode ; ++gnode )
-     {
-        Index_t count = domain.nodeElemCount(gnode) ;
-        Index_t start = domain.nodeElemStart(gnode) ;
-        Real_t fx = Real_t(0.0) ;
-        Real_t fy = Real_t(0.0) ;
-        Real_t fz = Real_t(0.0) ;
-        for (Index_t i=0 ; i < count ; ++i) {
-           Index_t elem = domain.nodeElemCornerList(start+i) ;
-           fx += fx_elem[elem] ;
-           fy += fy_elem[elem] ;
-           fz += fz_elem[elem] ;
-        }
-        domain.fx(gnode) = fx ;
-        domain.fy(gnode) = fy ;
-        domain.fz(gnode) = fz ;
-     }
-  }
-
-  Release(&fz_elem) ;
-  Release(&fy_elem) ;
-  Release(&fx_elem) ;
-}
-
-
-static inline
-void CollectDomainNodesToElemNodes(const Index_t* elemToNode,
-                                   Real_t elemX[8],
-                                   Real_t elemY[8],
-                                   Real_t elemZ[8])
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = domain.x(nd0i);
-   elemX[1] = domain.x(nd1i);
-   elemX[2] = domain.x(nd2i);
-   elemX[3] = domain.x(nd3i);
-   elemX[4] = domain.x(nd4i);
-   elemX[5] = domain.x(nd5i);
-   elemX[6] = domain.x(nd6i);
-   elemX[7] = domain.x(nd7i);
-
-   elemY[0] = domain.y(nd0i);
-   elemY[1] = domain.y(nd1i);
-   elemY[2] = domain.y(nd2i);
-   elemY[3] = domain.y(nd3i);
-   elemY[4] = domain.y(nd4i);
-   elemY[5] = domain.y(nd5i);
-   elemY[6] = domain.y(nd6i);
-   elemY[7] = domain.y(nd7i);
-
-   elemZ[0] = domain.z(nd0i);
-   elemZ[1] = domain.z(nd1i);
-   elemZ[2] = domain.z(nd2i);
-   elemZ[3] = domain.z(nd3i);
-   elemZ[4] = domain.z(nd4i);
-   elemZ[5] = domain.z(nd5i);
-   elemZ[6] = domain.z(nd6i);
-   elemZ[7] = domain.z(nd7i);
-
-}
-
-static inline
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-static inline
-void CalcElemVolumeDerivative(Real_t dvdx[8],
-                              Real_t dvdy[8],
-                              Real_t dvdz[8],
-                              const Real_t x[8],
-                              const Real_t y[8],
-                              const Real_t z[8])
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-static inline
-void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t *hourgam0,
-                              Real_t *hourgam1, Real_t *hourgam2, Real_t *hourgam3,
-                              Real_t *hourgam4, Real_t *hourgam5, Real_t *hourgam6,
-                              Real_t *hourgam7, Real_t coefficient,
-                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
-{
-   Index_t i00=0;
-   Index_t i01=1;
-   Index_t i02=2;
-   Index_t i03=3;
-
-   Real_t h00 =
-      hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] +
-      hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] +
-      hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] +
-      hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7];
-
-   Real_t h01 =
-      hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] +
-      hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] +
-      hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] +
-      hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7];
-
-   Real_t h02 =
-      hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+
-      hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+
-      hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+
-      hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7];
-
-   Real_t h03 =
-      hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] +
-      hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] +
-      hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] +
-      hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7];
-
-   hgfx[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfx[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfx[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfx[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfx[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfx[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfx[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfx[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] +
-      hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] +
-      hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] +
-      hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7];
-
-   h01 =
-      hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] +
-      hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] +
-      hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] +
-      hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7];
-
-   h02 =
-      hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+
-      hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+
-      hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+
-      hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7];
-
-   h03 =
-      hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] +
-      hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] +
-      hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] +
-      hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7];
-
-
-   hgfy[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfy[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfy[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfy[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfy[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfy[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfy[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfy[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] +
-      hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] +
-      hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] +
-      hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7];
-
-   h01 =
-      hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] +
-      hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] +
-      hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] +
-      hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7];
-
-   h02 =
-      hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+
-      hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+
-      hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+
-      hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7];
-
-   h03 =
-      hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] +
-      hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] +
-      hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] +
-      hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7];
-
-
-   hgfz[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfz[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfz[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfz[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfz[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfz[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfz[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfz[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-}
-
-static inline
-void CalcFBHourglassForceForElems(Real_t *determ,
-            Real_t *x8n,      Real_t *y8n,      Real_t *z8n,
-            Real_t *dvdx,     Real_t *dvdy,     Real_t *dvdz,
-            Real_t hourg)
-{
-   /*************************************************
-    *
-    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-    *               force.
-    *
-    *************************************************/
-
-   Index_t numElem = domain.numElem() ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *fx_elem = Allocate<Real_t>(numElem8) ;
-   Real_t *fy_elem = Allocate<Real_t>(numElem8) ;
-   Real_t *fz_elem = Allocate<Real_t>(numElem8) ;
-
-   Real_t  gamma[4][8];
-
-   gamma[0][0] = Real_t( 1.);
-   gamma[0][1] = Real_t( 1.);
-   gamma[0][2] = Real_t(-1.);
-   gamma[0][3] = Real_t(-1.);
-   gamma[0][4] = Real_t(-1.);
-   gamma[0][5] = Real_t(-1.);
-   gamma[0][6] = Real_t( 1.);
-   gamma[0][7] = Real_t( 1.);
-   gamma[1][0] = Real_t( 1.);
-   gamma[1][1] = Real_t(-1.);
-   gamma[1][2] = Real_t(-1.);
-   gamma[1][3] = Real_t( 1.);
-   gamma[1][4] = Real_t(-1.);
-   gamma[1][5] = Real_t( 1.);
-   gamma[1][6] = Real_t( 1.);
-   gamma[1][7] = Real_t(-1.);
-   gamma[2][0] = Real_t( 1.);
-   gamma[2][1] = Real_t(-1.);
-   gamma[2][2] = Real_t( 1.);
-   gamma[2][3] = Real_t(-1.);
-   gamma[2][4] = Real_t( 1.);
-   gamma[2][5] = Real_t(-1.);
-   gamma[2][6] = Real_t( 1.);
-   gamma[2][7] = Real_t(-1.);
-   gamma[3][0] = Real_t(-1.);
-   gamma[3][1] = Real_t( 1.);
-   gamma[3][2] = Real_t(-1.);
-   gamma[3][3] = Real_t( 1.);
-   gamma[3][4] = Real_t( 1.);
-   gamma[3][5] = Real_t(-1.);
-   gamma[3][6] = Real_t( 1.);
-   gamma[3][7] = Real_t(-1.);
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-
-#pragma omp parallel for firstprivate(numElem, hourg) 
-   for(Index_t i2=0; i2<numElem; ++i2){
-      Real_t *fx_local, *fy_local, *fz_local ;
-      Real_t hgfx[8], hgfy[8], hgfz[8] ;
-
-      Real_t coefficient;
-
-      Real_t hourgam0[4], hourgam1[4], hourgam2[4], hourgam3[4] ;
-      Real_t hourgam4[4], hourgam5[4], hourgam6[4], hourgam7[4];
-      Real_t xd1[8], yd1[8], zd1[8] ;
-
-      const Index_t *elemToNode = domain.nodelist(i2);
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
-            x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
-            x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
-            x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
-            y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
-            y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
-            y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
-            z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
-            z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
-            z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];
-
-         hourgam0[i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam1[i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam2[i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam3[i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam4[i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam5[i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam6[i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam7[i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=domain.ss(i2);
-      mass1=domain.elemMass(i2);
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = domain.xd(n0si2);
-      xd1[1] = domain.xd(n1si2);
-      xd1[2] = domain.xd(n2si2);
-      xd1[3] = domain.xd(n3si2);
-      xd1[4] = domain.xd(n4si2);
-      xd1[5] = domain.xd(n5si2);
-      xd1[6] = domain.xd(n6si2);
-      xd1[7] = domain.xd(n7si2);
-
-      yd1[0] = domain.yd(n0si2);
-      yd1[1] = domain.yd(n1si2);
-      yd1[2] = domain.yd(n2si2);
-      yd1[3] = domain.yd(n3si2);
-      yd1[4] = domain.yd(n4si2);
-      yd1[5] = domain.yd(n5si2);
-      yd1[6] = domain.yd(n6si2);
-      yd1[7] = domain.yd(n7si2);
-
-      zd1[0] = domain.zd(n0si2);
-      zd1[1] = domain.zd(n1si2);
-      zd1[2] = domain.zd(n2si2);
-      zd1[3] = domain.zd(n3si2);
-      zd1[4] = domain.zd(n4si2);
-      zd1[5] = domain.zd(n5si2);
-      zd1[6] = domain.zd(n6si2);
-      zd1[7] = domain.zd(n7si2);
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1,
-                      hourgam0,hourgam1,hourgam2,hourgam3,
-                      hourgam4,hourgam5,hourgam6,hourgam7,
-                      coefficient, hgfx, hgfy, hgfz);
-
-      fx_local = &fx_elem[i3] ;
-      fx_local[0] = hgfx[0];
-      fx_local[1] = hgfx[1];
-      fx_local[2] = hgfx[2];
-      fx_local[3] = hgfx[3];
-      fx_local[4] = hgfx[4];
-      fx_local[5] = hgfx[5];
-      fx_local[6] = hgfx[6];
-      fx_local[7] = hgfx[7];
-
-      fy_local = &fy_elem[i3] ;
-      fy_local[0] = hgfy[0];
-      fy_local[1] = hgfy[1];
-      fy_local[2] = hgfy[2];
-      fy_local[3] = hgfy[3];
-      fy_local[4] = hgfy[4];
-      fy_local[5] = hgfy[5];
-      fy_local[6] = hgfy[6];
-      fy_local[7] = hgfy[7];
-
-      fz_local = &fz_elem[i3] ;
-      fz_local[0] = hgfz[0];
-      fz_local[1] = hgfz[1];
-      fz_local[2] = hgfz[2];
-      fz_local[3] = hgfz[3];
-      fz_local[4] = hgfz[4];
-      fz_local[5] = hgfz[5];
-      fz_local[6] = hgfz[6];
-      fz_local[7] = hgfz[7];
-
-#if 0
-      domain.fx(n0si2) += hgfx[0];
-      domain.fy(n0si2) += hgfy[0];
-      domain.fz(n0si2) += hgfz[0];
-
-      domain.fx(n1si2) += hgfx[1];
-      domain.fy(n1si2) += hgfy[1];
-      domain.fz(n1si2) += hgfz[1];
-
-      domain.fx(n2si2) += hgfx[2];
-      domain.fy(n2si2) += hgfy[2];
-      domain.fz(n2si2) += hgfz[2];
-
-      domain.fx(n3si2) += hgfx[3];
-      domain.fy(n3si2) += hgfy[3];
-      domain.fz(n3si2) += hgfz[3];
-
-      domain.fx(n4si2) += hgfx[4];
-      domain.fy(n4si2) += hgfy[4];
-      domain.fz(n4si2) += hgfz[4];
-
-      domain.fx(n5si2) += hgfx[5];
-      domain.fy(n5si2) += hgfy[5];
-      domain.fz(n5si2) += hgfz[5];
-
-      domain.fx(n6si2) += hgfx[6];
-      domain.fy(n6si2) += hgfy[6];
-      domain.fz(n6si2) += hgfz[6];
-
-      domain.fx(n7si2) += hgfx[7];
-      domain.fy(n7si2) += hgfy[7];
-      domain.fz(n7si2) += hgfz[7];
-#endif
-   }
-
-  {
-     Index_t numNode = domain.numNode() ;
-
-#pragma omp parallel for firstprivate(numNode)
-     for( Index_t gnode=0 ; gnode<numNode ; ++gnode )
-     {
-        Index_t count = domain.nodeElemCount(gnode) ;
-        Index_t start = domain.nodeElemStart(gnode) ;
-        Real_t fx = Real_t(0.0) ;
-        Real_t fy = Real_t(0.0) ;
-        Real_t fz = Real_t(0.0) ;
-        for (Index_t i=0 ; i < count ; ++i) {
-           Index_t elem = domain.nodeElemCornerList(start+i) ;
-           fx += fx_elem[elem] ;
-           fy += fy_elem[elem] ;
-           fz += fz_elem[elem] ;
-        }
-        domain.fx(gnode) += fx ;
-        domain.fy(gnode) += fy ;
-        domain.fz(gnode) += fz ;
-     }
-  }
-
-  Release(&fz_elem) ;
-  Release(&fy_elem) ;
-  Release(&fx_elem) ;
-}
-
-static inline
-void CalcHourglassControlForElems(Real_t determ[], Real_t hgcoef)
-{
-   Index_t numElem = domain.numElem() ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *dvdx = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdy = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdz = Allocate<Real_t>(numElem8) ;
-   Real_t *x8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *y8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *z8n  = Allocate<Real_t>(numElem8) ;
-
-   /* start loop over elements */
-#pragma omp parallel for firstprivate(numElem)
-   for (Index_t i=0 ; i<numElem ; ++i){
-      Real_t  x1[8],  y1[8],  z1[8] ;
-      Real_t pfx[8], pfy[8], pfz[8] ;
-
-      Index_t* elemToNode = domain.nodelist(i);
-      CollectDomainNodesToElemNodes(elemToNode, x1, y1, z1);
-
-      CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
-
-      /* load into temporary storage for FB Hour Glass control */
-      for(Index_t ii=0;ii<8;++ii){
-         Index_t jj=8*i+ii;
-
-         dvdx[jj] = pfx[ii];
-         dvdy[jj] = pfy[ii];
-         dvdz[jj] = pfz[ii];
-         x8n[jj]  = x1[ii];
-         y8n[jj]  = y1[ii];
-         z8n[jj]  = z1[ii];
-      }
-
-      determ[i] = domain.volo(i) * domain.v(i);
-
-      /* Do a check for negative volumes */
-      if ( domain.v(i) <= Real_t(0.0) ) {
-         exit(VolumeError) ;
-      }
-   }
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems(determ,x8n,y8n,z8n,dvdx,dvdy,dvdz,hgcoef) ;
-   }
-
-   Release(&z8n) ;
-   Release(&y8n) ;
-   Release(&x8n) ;
-   Release(&dvdz) ;
-   Release(&dvdy) ;
-   Release(&dvdx) ;
-
-   return ;
-}
-
-static inline
-void CalcVolumeForceForElems()
-{
-   Index_t numElem = domain.numElem() ;
-   if (numElem != 0) {
-      Real_t  hgcoef = domain.hgcoef() ;
-      Real_t *sigxx  = Allocate<Real_t>(numElem) ;
-      Real_t *sigyy  = Allocate<Real_t>(numElem) ;
-      Real_t *sigzz  = Allocate<Real_t>(numElem) ;
-      Real_t *determ = Allocate<Real_t>(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(numElem, sigxx, sigyy, sigzz);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( numElem, sigxx, sigyy, sigzz, determ) ;
-
-      // check for negative element volume
-#pragma omp parallel for firstprivate(numElem)
-      for ( Index_t k=0 ; k<numElem ; ++k ) {
-         if (determ[k] <= Real_t(0.0)) {
-            exit(VolumeError) ;
-         }
-      }
-
-      CalcHourglassControlForElems(determ, hgcoef) ;
-
-      Release(&determ) ;
-      Release(&sigzz) ;
-      Release(&sigyy) ;
-      Release(&sigxx) ;
-   }
-}
-
-static inline void CalcForceForNodes()
-{
-  Index_t numNode = domain.numNode() ;
-#pragma omp parallel for firstprivate(numNode)
-  for (Index_t i=0; i<numNode; ++i) {
-     domain.fx(i) = Real_t(0.0) ;
-     domain.fy(i) = Real_t(0.0) ;
-     domain.fz(i) = Real_t(0.0) ;
-  }
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems() ;
-
-  /* Calculate Nodal Forces at domain boundaries */
-  /* problem->commSBN->Transfer(CommSBN::forces); */
-
-}
-
-static inline
-void CalcAccelerationForNodes()
-{
-   Index_t numNode = domain.numNode() ;
-#pragma omp parallel for firstprivate(numNode)
-   for (Index_t i = 0; i < numNode; ++i) {
-      domain.xdd(i) = domain.fx(i) / domain.nodalMass(i);
-      domain.ydd(i) = domain.fy(i) / domain.nodalMass(i);
-      domain.zdd(i) = domain.fz(i) / domain.nodalMass(i);
-   }
-}
-
-
-static inline
-void ApplyAccelerationBoundaryConditionsForNodes()
-{
-  Index_t numNodeBC = (domain.sizeX()+1)*(domain.sizeX()+1) ;
- 
-#pragma omp parallel
-{
-#pragma omp for nowait firstprivate(numNodeBC)
-  for(Index_t i=0 ; i<numNodeBC ; ++i)
-    domain.xdd(domain.symmX(i)) = Real_t(0.0) ;
-
-#pragma omp for nowait firstprivate(numNodeBC)
-  for(Index_t i=0 ; i<numNodeBC ; ++i)
-    domain.ydd(domain.symmY(i)) = Real_t(0.0) ;
-
-#pragma omp for firstprivate(numNodeBC)
-  for(Index_t i=0 ; i<numNodeBC ; ++i)
-    domain.zdd(domain.symmZ(i)) = Real_t(0.0) ;
-}
-}
-
-static inline
-void CalcVelocityForNodes(const Real_t dt, const Real_t u_cut)
-{
-   Index_t numNode = domain.numNode() ;
-
-#pragma omp parallel for firstprivate(numNode)
-   for ( Index_t i = 0 ; i < numNode ; ++i )
-   {
-     Real_t xdtmp, ydtmp, zdtmp ;
-
-     xdtmp = domain.xd(i) + domain.xdd(i) * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     domain.xd(i) = xdtmp ;
-
-     ydtmp = domain.yd(i) + domain.ydd(i) * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     domain.yd(i) = ydtmp ;
-
-     zdtmp = domain.zd(i) + domain.zdd(i) * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     domain.zd(i) = zdtmp ;
-   }
-}
-
-static inline
-void CalcPositionForNodes(const Real_t dt)
-{
-   Index_t numNode = domain.numNode() ;
-
-#pragma omp parallel for firstprivate(numNode)
-   for ( Index_t i = 0 ; i < numNode ; ++i )
-   {
-     domain.x(i) += domain.xd(i) * dt ;
-     domain.y(i) += domain.yd(i) * dt ;
-     domain.z(i) += domain.zd(i) * dt ;
-   }
-}
-
-static inline
-void LagrangeNodal()
-{
-  const Real_t delt = domain.deltatime() ;
-  Real_t u_cut = domain.u_cut() ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes();
-
-  CalcAccelerationForNodes();
-
-  ApplyAccelerationBoundaryConditionsForNodes();
-
-  CalcVelocityForNodes( delt, u_cut ) ;
-
-  CalcPositionForNodes( delt );
-
-  return;
-}
-
-static inline
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-static inline
-Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-static inline
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-static inline
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-static inline
-void CalcElemVelocityGradient( const Real_t* const xvel,
-                               const Real_t* const yvel,
-                               const Real_t* const zvel,
-                               const Real_t b[][8],
-                               const Real_t detJ,
-                               Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-static inline
-void CalcKinematicsForElems( Index_t numElem, Real_t dt )
-{
-  // loop over all elements
-#pragma omp parallel for firstprivate(numElem, dt)
-  for( Index_t k=0 ; k<numElem ; ++k )
-  {
-     Real_t B[3][8] ; /** shape function derivatives */
-     Real_t D[6] ;
-     Real_t x_local[8] ;
-     Real_t y_local[8] ;
-     Real_t z_local[8] ;
-     Real_t xd_local[8] ;
-     Real_t yd_local[8] ;
-     Real_t zd_local[8] ;
-     Real_t detJ = Real_t(0.0) ;
-
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_t* const elemToNode = domain.nodelist(k) ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      x_local[lnode] = domain.x(gnode);
-      y_local[lnode] = domain.y(gnode);
-      z_local[lnode] = domain.z(gnode);
-    }
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / domain.volo(k) ;
-    domain.vnew(k) = relativeVolume ;
-    domain.delv(k) = relativeVolume - domain.v(k) ;
-
-    // set characteristic length
-    domain.arealg(k) = CalcElemCharacteristicLength(x_local,
-                                                  y_local,
-                                                  z_local,
-                                                  volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = domain.xd(gnode);
-      yd_local[lnode] = domain.yd(gnode);
-      zd_local[lnode] = domain.zd(gnode);
-    }
-
-    Real_t dt2 = Real_t(0.5) * dt;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local,
-                                          y_local,
-                                          z_local,
-                                          B, &detJ );
-
-    CalcElemVelocityGradient( xd_local,
-                              yd_local,
-                              zd_local,
-                              B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    domain.dxx(k) = D[0];
-    domain.dyy(k) = D[1];
-    domain.dzz(k) = D[2];
-  }
-}
-
-static inline
-void CalcLagrangeElements(Real_t deltatime)
-{
-   Index_t numElem = domain.numElem() ;
-   if (numElem > 0) {
-      CalcKinematicsForElems(numElem, deltatime) ;
-
-      // element loop to do some stuff not included in the elemlib function.
-
-#pragma omp parallel for firstprivate(numElem)
-      for ( Index_t k=0 ; k<numElem ; ++k )
-      {
-        // calc strain rate and apply as constraint (only done in FB element)
-        Real_t vdov = domain.dxx(k) + domain.dyy(k) + domain.dzz(k) ;
-        Real_t vdovthird = vdov/Real_t(3.0) ;
-        
-        // make the rate of deformation tensor deviatoric
-        domain.vdov(k) = vdov ;
-        domain.dxx(k) -= vdovthird ;
-        domain.dyy(k) -= vdovthird ;
-        domain.dzz(k) -= vdovthird ;
-
-        // See if any volumes are negative, and take appropriate action.
-        if (domain.vnew(k) <= Real_t(0.0))
-        {
-           exit(VolumeError) ;
-        }
-      }
-   }
-}
-
-static inline
-void CalcMonotonicQGradientsForElems()
-{
-#define SUM4(a,b,c,d) (a + b + c + d)
-   Index_t numElem = domain.numElem() ;
-
-#pragma omp parallel for firstprivate(numElem)
-   for (Index_t i = 0 ; i < numElem ; ++i ) {
-      const Real_t ptiny = Real_t(1.e-36) ;
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      const Index_t *elemToNode = domain.nodelist(i);
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = domain.x(n0) ;
-      Real_t x1 = domain.x(n1) ;
-      Real_t x2 = domain.x(n2) ;
-      Real_t x3 = domain.x(n3) ;
-      Real_t x4 = domain.x(n4) ;
-      Real_t x5 = domain.x(n5) ;
-      Real_t x6 = domain.x(n6) ;
-      Real_t x7 = domain.x(n7) ;
-
-      Real_t y0 = domain.y(n0) ;
-      Real_t y1 = domain.y(n1) ;
-      Real_t y2 = domain.y(n2) ;
-      Real_t y3 = domain.y(n3) ;
-      Real_t y4 = domain.y(n4) ;
-      Real_t y5 = domain.y(n5) ;
-      Real_t y6 = domain.y(n6) ;
-      Real_t y7 = domain.y(n7) ;
-
-      Real_t z0 = domain.z(n0) ;
-      Real_t z1 = domain.z(n1) ;
-      Real_t z2 = domain.z(n2) ;
-      Real_t z3 = domain.z(n3) ;
-      Real_t z4 = domain.z(n4) ;
-      Real_t z5 = domain.z(n5) ;
-      Real_t z6 = domain.z(n6) ;
-      Real_t z7 = domain.z(n7) ;
-
-      Real_t xv0 = domain.xd(n0) ;
-      Real_t xv1 = domain.xd(n1) ;
-      Real_t xv2 = domain.xd(n2) ;
-      Real_t xv3 = domain.xd(n3) ;
-      Real_t xv4 = domain.xd(n4) ;
-      Real_t xv5 = domain.xd(n5) ;
-      Real_t xv6 = domain.xd(n6) ;
-      Real_t xv7 = domain.xd(n7) ;
-
-      Real_t yv0 = domain.yd(n0) ;
-      Real_t yv1 = domain.yd(n1) ;
-      Real_t yv2 = domain.yd(n2) ;
-      Real_t yv3 = domain.yd(n3) ;
-      Real_t yv4 = domain.yd(n4) ;
-      Real_t yv5 = domain.yd(n5) ;
-      Real_t yv6 = domain.yd(n6) ;
-      Real_t yv7 = domain.yd(n7) ;
-
-      Real_t zv0 = domain.zd(n0) ;
-      Real_t zv1 = domain.zd(n1) ;
-      Real_t zv2 = domain.zd(n2) ;
-      Real_t zv3 = domain.zd(n3) ;
-      Real_t zv4 = domain.zd(n4) ;
-      Real_t zv5 = domain.zd(n5) ;
-      Real_t zv6 = domain.zd(n6) ;
-      Real_t zv7 = domain.zd(n7) ;
-
-      Real_t vol = domain.volo(i)*domain.vnew(i) ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;
-      Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;
-      Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;
-      Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;
-      Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;
-      Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;
-      Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      domain.delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;
-      dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;
-      dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;
-
-      domain.delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      domain.delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;
-      dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;
-      dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;
-
-      domain.delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      domain.delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;
-      dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;
-      dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;
-
-      domain.delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;
-   }
-#undef SUM4
-}
-
-static inline
-void CalcMonotonicQRegionForElems(// parameters
-                          Real_t qlc_monoq,
-                          Real_t qqc_monoq,
-                          Real_t monoq_limiter_mult,
-                          Real_t monoq_max_slope,
-                          Real_t ptiny,
-
-                          // the elementset length
-                          Index_t elength )
-{
-#pragma omp parallel for firstprivate(elength, qlc_monoq, qqc_monoq, monoq_limiter_mult, monoq_max_slope, ptiny)
-   for ( Index_t ielem = 0 ; ielem < elength; ++ielem ) {
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Index_t i = domain.matElemlist(ielem);
-      Int_t bcMask = domain.elemBC(i) ;
-      Real_t delvm, delvp ;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / ( domain.delv_xi(i) + ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case 0:         delvm = domain.delv_xi(domain.lxim(i)) ; break ;
-         case XI_M_SYMM: delvm = domain.delv_xi(i) ;            break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;                break ;
-         default:        /* ERROR */ ;                        break ;
-      }
-      switch (bcMask & XI_P) {
-         case 0:         delvp = domain.delv_xi(domain.lxip(i)) ; break ;
-         case XI_P_SYMM: delvp = domain.delv_xi(i) ;            break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;                break ;
-         default:        /* ERROR */ ;                        break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( domain.delv_eta(i) + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case 0:          delvm = domain.delv_eta(domain.letam(i)) ; break ;
-         case ETA_M_SYMM: delvm = domain.delv_eta(i) ;             break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;                  break ;
-         default:         /* ERROR */ ;                          break ;
-      }
-      switch (bcMask & ETA_P) {
-         case 0:          delvp = domain.delv_eta(domain.letap(i)) ; break ;
-         case ETA_P_SYMM: delvp = domain.delv_eta(i) ;             break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;                  break ;
-         default:         /* ERROR */ ;                          break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( domain.delv_zeta(i) + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case 0:           delvm = domain.delv_zeta(domain.lzetam(i)) ; break ;
-         case ZETA_M_SYMM: delvm = domain.delv_zeta(i) ;              break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;                    break ;
-         default:          /* ERROR */ ;                            break ;
-      }
-      switch (bcMask & ZETA_P) {
-         case 0:           delvp = domain.delv_zeta(domain.lzetap(i)) ; break ;
-         case ZETA_P_SYMM: delvp = domain.delv_zeta(i) ;              break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;                    break ;
-         default:          /* ERROR */ ;                            break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( domain.vdov(i) > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = domain.delv_xi(i)   * domain.delx_xi(i)   ;
-         Real_t delvxeta  = domain.delv_eta(i)  * domain.delx_eta(i)  ;
-         Real_t delvxzeta = domain.delv_zeta(i) * domain.delx_zeta(i) ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = domain.elemMass(i) / (domain.volo(i) * domain.vnew(i)) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      domain.qq(i) = qquad ;
-      domain.ql(i) = qlin  ;
-   }
-}
-
-static inline
-void CalcMonotonicQForElems()
-{  
-   //
-   // initialize parameters
-   // 
-   const Real_t ptiny        = Real_t(1.e-36) ;
-   Real_t monoq_max_slope    = domain.monoq_max_slope() ;
-   Real_t monoq_limiter_mult = domain.monoq_limiter_mult() ;
-
-   //
-   // calculate the monotonic q for pure regions
-   //
-   Index_t elength = domain.numElem() ;
-   if (elength > 0) {
-      Real_t qlc_monoq = domain.qlc_monoq();
-      Real_t qqc_monoq = domain.qqc_monoq();
-      CalcMonotonicQRegionForElems(// parameters
-                           qlc_monoq,
-                           qqc_monoq,
-                           monoq_limiter_mult,
-                           monoq_max_slope,
-                           ptiny,
-
-                           // the elemset length
-                           elength );
-   }
-}
-
-static inline
-void CalcQForElems()
-{
-   Real_t qstop = domain.qstop() ;
-   Index_t numElem = domain.numElem() ;
-
-   //
-   // MONOTONIC Q option
-   //
-
-   /* Calculate velocity gradients */
-   CalcMonotonicQGradientsForElems() ;
-
-   /* Transfer veloctiy gradients in the first order elements */
-   /* problem->commElements->Transfer(CommElements::monoQ) ; */
-   CalcMonotonicQForElems() ;
-
-   /* Don't allow excessive artificial viscosity */
-   if (numElem != 0) {
-      Index_t idx = -1; 
-      for (Index_t i=0; i<numElem; ++i) {
-         if ( domain.q(i) > qstop ) {
-            idx = i ;
-            break ;
-         }
-      }
-
-      if(idx >= 0) {
-         exit(QStopError) ;
-      }
-   }
-}
-
-static inline
-void CalcPressureForElems(Real_t* p_new, Real_t* bvc,
-                          Real_t* pbvc, Real_t* e_old,
-                          Real_t* compression, Real_t *vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          Index_t length)
-{
-
-#pragma omp parallel for firstprivate(length)
-   for (Index_t i = 0; i < length ; ++i) {
-      Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
-      bvc[i] = c1s * (compression[i] + Real_t(1.));
-      pbvc[i] = c1s;
-   }
-
-#pragma omp parallel for firstprivate(length, pmin, p_cut, eosvmax)
-   for (Index_t i = 0 ; i < length ; ++i){
-      p_new[i] = bvc[i] * e_old[i] ;
-
-      if    (FABS(p_new[i]) <  p_cut   )
-         p_new[i] = Real_t(0.0) ;
-
-      if    ( vnewc[i] >= eosvmax ) /* impossible condition here? */
-         p_new[i] = Real_t(0.0) ;
-
-      if    (p_new[i]       <  pmin)
-         p_new[i]   = pmin ;
-   }
-}
-
-static inline
-void CalcEnergyForElems(Real_t* p_new, Real_t* e_new, Real_t* q_new,
-                        Real_t* bvc, Real_t* pbvc,
-                        Real_t* p_old, Real_t* e_old, Real_t* q_old,
-                        Real_t* compression, Real_t* compHalfStep,
-                        Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,
-                        Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
-                        Real_t* qq, Real_t* ql,
-                        Real_t rho0,
-                        Real_t eosvmax,
-                        Index_t length)
-{
-   Real_t *pHalfStep = Allocate<Real_t>(length) ;
-
-#pragma omp parallel for firstprivate(length, emin)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
-         + Real_t(0.5) * work[i];
-
-      if (e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   }
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                   pmin, p_cut, eosvmax, length);
-
-#pragma omp parallel for firstprivate(length, rho0)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
-
-      if ( delvc[i] > Real_t(0.) ) {
-         q_new[i] /* = qq[i] = ql[i] */ = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc =Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql[i] + qq[i]) ;
-      }
-
-      e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]
-         * (  Real_t(3.0)*(p_old[i]     + q_old[i])
-              - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;
-   }
-
-#pragma omp parallel for firstprivate(length, emin, e_cut)
-   for (Index_t i = 0 ; i < length ; ++i) {
-
-      e_new[i] += Real_t(0.5) * work[i];
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   }
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, length);
-
-#pragma omp parallel for firstprivate(length, rho0, emin, e_cut)
-   for (Index_t i = 0 ; i < length ; ++i){
-      const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-      Real_t q_tilde ;
-
-      if (delvc[i] > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*ql[i] + qq[i]) ;
-      }
-
-      e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
-                               - Real_t(8.0)*(pHalfStep[i] + q_new[i])
-                               + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   }
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, length);
-
-#pragma omp parallel for firstprivate(length, rho0, q_cut)
-   for (Index_t i = 0 ; i < length ; ++i){
-
-      if ( delvc[i] <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql[i] + qq[i]) ;
-
-         if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
-      }
-   }
-
-   Release(&pHalfStep) ;
-
-   return ;
-}
-
-static inline
-void CalcSoundSpeedForElems(Real_t *vnewc, Real_t rho0, Real_t *enewc,
-                            Real_t *pnewc, Real_t *pbvc,
-                            Real_t *bvc, Real_t ss4o3, Index_t nz)
-{
-#pragma omp parallel for firstprivate(nz, rho0, ss4o3)
-   for (Index_t i = 0; i < nz ; ++i) {
-      Index_t iz = domain.matElemlist(i);
-      Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[i] * vnewc[i] *
-                 bvc[i] * pnewc[i]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp) ;
-      }
-      domain.ss(iz) = ssTmp ;
-   }
-}
-
-static inline
-void EvalEOSForElems(Real_t *vnewc, Index_t length)
-{
-   Real_t  e_cut = domain.e_cut();
-   Real_t  p_cut = domain.p_cut();
-   Real_t  ss4o3 = domain.ss4o3();
-   Real_t  q_cut = domain.q_cut();
-
-   Real_t eosvmax = domain.eosvmax() ;
-   Real_t eosvmin = domain.eosvmin() ;
-   Real_t pmin    = domain.pmin() ;
-   Real_t emin    = domain.emin() ;
-   Real_t rho0    = domain.refdens() ;
-
-   Real_t *e_old = Allocate<Real_t>(length) ;
-   Real_t *delvc = Allocate<Real_t>(length) ;
-   Real_t *p_old = Allocate<Real_t>(length) ;
-   Real_t *q_old = Allocate<Real_t>(length) ;
-   Real_t *compression = Allocate<Real_t>(length) ;
-   Real_t *compHalfStep = Allocate<Real_t>(length) ;
-   Real_t *qq = Allocate<Real_t>(length) ;
-   Real_t *ql = Allocate<Real_t>(length) ;
-   Real_t *work = Allocate<Real_t>(length) ;
-   Real_t *p_new = Allocate<Real_t>(length) ;
-   Real_t *e_new = Allocate<Real_t>(length) ;
-   Real_t *q_new = Allocate<Real_t>(length) ;
-   Real_t *bvc = Allocate<Real_t>(length) ;
-   Real_t *pbvc = Allocate<Real_t>(length) ;
-
-   /* compress data, minimal set */
-#pragma omp parallel
-   {
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t zidx = domain.matElemlist(i) ;
-         e_old[i] = domain.e(zidx) ;
-      }
-
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t zidx = domain.matElemlist(i) ;
-         delvc[i] = domain.delv(zidx) ;
-      }
-
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t zidx = domain.matElemlist(i) ;
-         p_old[i] = domain.p(zidx) ;
-      }
-
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t zidx = domain.matElemlist(i) ;
-         q_old[i] = domain.q(zidx) ;
-      }
-
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i = 0; i < length ; ++i) {
-         Real_t vchalf ;
-         compression[i] = Real_t(1.) / vnewc[i] - Real_t(1.);
-         vchalf = vnewc[i] - delvc[i] * Real_t(.5);
-         compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);
-      }
-
-   /* Check for v > eosvmax or v < eosvmin */
-      if ( eosvmin != Real_t(0.) ) {
-#pragma omp for nowait firstprivate(length,eosvmin)
-         for(Index_t i=0 ; i<length ; ++i) {
-            if (vnewc[i] <= eosvmin) { /* impossible due to calling func? */
-               compHalfStep[i] = compression[i] ;
-            }
-         }
-      }
-      if ( eosvmax != Real_t(0.) ) {
-#pragma omp for nowait firstprivate(length,eosvmax)
-         for(Index_t i=0 ; i<length ; ++i) {
-            if (vnewc[i] >= eosvmax) { /* impossible due to calling func? */
-               p_old[i]        = Real_t(0.) ;
-               compression[i]  = Real_t(0.) ;
-               compHalfStep[i] = Real_t(0.) ;
-            }
-         }
-      }
-
-#pragma omp for firstprivate(length)
-      for (Index_t i = 0 ; i < length ; ++i) {
-         Index_t zidx = domain.matElemlist(i) ;
-         qq[i] = domain.qq(zidx) ;
-         ql[i] = domain.ql(zidx) ;
-         work[i] = Real_t(0.) ; 
-      }
-   }
-
-   CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
-                 p_old, e_old,  q_old, compression, compHalfStep,
-                 vnewc, work,  delvc, pmin,
-                 p_cut, e_cut, q_cut, emin,
-                 qq, ql, rho0, eosvmax, length);
-
-
-#pragma omp parallel
-   {
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t zidx = domain.matElemlist(i) ;
-         domain.p(zidx) = p_new[i] ;
-      }
-
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t zidx = domain.matElemlist(i) ;
-         domain.e(zidx) = e_new[i] ;
-      }
-
-#pragma omp for firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t zidx = domain.matElemlist(i) ;
-         domain.q(zidx) = q_new[i] ;
-      }
-   }
-
-   CalcSoundSpeedForElems(vnewc, rho0, e_new, p_new,
-             pbvc, bvc, ss4o3, length) ;
-
-   Release(&pbvc) ;
-   Release(&bvc) ;
-   Release(&q_new) ;
-   Release(&e_new) ;
-   Release(&p_new) ;
-   Release(&work) ;
-   Release(&ql) ;
-   Release(&qq) ;
-   Release(&compHalfStep) ;
-   Release(&compression) ;
-   Release(&q_old) ;
-   Release(&p_old) ;
-   Release(&delvc) ;
-   Release(&e_old) ;
-}
-
-static inline
-void ApplyMaterialPropertiesForElems()
-{
-  Index_t length = domain.numElem() ;
-
-  if (length != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = domain.eosvmin() ;
-    Real_t eosvmax = domain.eosvmax() ;
-    Real_t *vnewc = Allocate<Real_t>(length) ;
-
-#pragma omp parallel
-    {
-#pragma omp for nowait firstprivate(length)
-       for (Index_t i=0 ; i<length ; ++i) {
-          Index_t zn = domain.matElemlist(i) ;
-          vnewc[i] = domain.vnew(zn) ;
-       }
-
-       if (eosvmin != Real_t(0.)) {
-#pragma omp for nowait firstprivate(length,eosvmin)
-          for(Index_t i=0 ; i<length ; ++i) {
-             if (vnewc[i] < eosvmin)
-                vnewc[i] = eosvmin ;
-          }
-       }
-
-       if (eosvmax != Real_t(0.)) {
-#pragma omp for nowait firstprivate(length,eosvmax)
-          for(Index_t i=0 ; i<length ; ++i) {
-             if (vnewc[i] > eosvmax)
-                vnewc[i] = eosvmax ;
-          }
-       }
-
-#pragma omp for firstprivate(length,eosvmin,eosvmax)
-       for (Index_t i=0; i<length; ++i) {
-          Index_t zn = domain.matElemlist(i) ;
-          Real_t vc = domain.v(zn) ;
-          if (eosvmin != Real_t(0.)) {
-             if (vc < eosvmin)
-                vc = eosvmin ;
-          }
-          if (eosvmax != Real_t(0.)) {
-             if (vc > eosvmax)
-                vc = eosvmax ;
-          }
-          if (vc <= 0.) {
-             exit(VolumeError) ;
-          }
-       }
-    }
-
-    EvalEOSForElems(vnewc, length);
-
-    Release(&vnewc) ;
-
-  }
-}
-
-static inline
-void UpdateVolumesForElems()
-{
-   Index_t numElem = domain.numElem();
-   if (numElem != 0) {
-      Real_t v_cut = domain.v_cut();
-
-#pragma omp parallel for firstprivate(numElem,v_cut)
-      for(Index_t i=0 ; i<numElem ; ++i) {
-         Real_t tmpV ;
-         tmpV = domain.vnew(i) ;
-
-         if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-            tmpV = Real_t(1.0) ;
-         domain.v(i) = tmpV ;
-      }
-   }
-
-   return ;
-}
-
-static inline
-void LagrangeElements()
-{
-  const Real_t deltatime = domain.deltatime() ;
-
-  CalcLagrangeElements(deltatime) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems() ;
-
-  ApplyMaterialPropertiesForElems() ;
-
-  UpdateVolumesForElems() ;
-}
-
-static inline
-void CalcCourantConstraintForElems()
-{
-   Real_t dtcourant = Real_t(1.0e+20) ;
-   Index_t   courant_elem = -1 ;
-   Real_t      qqc = domain.qqc() ;
-   Index_t length = domain.numElem() ;
-
-   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
-
-#pragma omp parallel for firstprivate(length,qqc2), shared(dtcourant,courant_elem)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Index_t indx = domain.matElemlist(i) ;
-
-      Real_t dtf = domain.ss(indx) * domain.ss(indx) ;
-
-      if ( domain.vdov(indx) < Real_t(0.) ) {
-
-         dtf = dtf
-            + qqc2 * domain.arealg(indx) * domain.arealg(indx)
-            * domain.vdov(indx) * domain.vdov(indx) ;
-      }
-
-      dtf = SQRT(dtf) ;
-
-      dtf = domain.arealg(indx) / dtf ;
-
-   /* determine minimum timestep with its corresponding elem */
-      if (domain.vdov(indx) != Real_t(0.)) {
-#pragma omp critical
-         {
-            if ( dtf < dtcourant ) {
-               dtcourant = dtf ;
-               courant_elem = indx ;
-            }
-         }
-      }
-   }
-
-   /* Don't try to register a time constraint if none of the elements
-    * were active */
-   if (courant_elem != -1) {
-      domain.dtcourant() = dtcourant ;
-   }
-
-   return ;
-}
-
-static inline
-void CalcHydroConstraintForElems()
-{
-   Real_t dthydro = Real_t(1.0e+20) ;
-   Index_t hydro_elem = -1 ;
-   Real_t dvovmax = domain.dvovmax() ;
-   Index_t length = domain.numElem() ;
-
-#pragma omp parallel for firstprivate(length), shared(dthydro,hydro_elem)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Index_t indx = domain.matElemlist(i) ;
-
-      if (domain.vdov(indx) != Real_t(0.)) {
-         Real_t dtdvov = dvovmax / (FABS(domain.vdov(indx))+Real_t(1.e-20)) ;
-#pragma omp critical
-         {
-            if ( dthydro > dtdvov ) {
-               dthydro = dtdvov ;
-               hydro_elem = indx ;
-            }
-         }
-      }
-   }
-
-   if (hydro_elem != -1) {
-      domain.dthydro() = dthydro ;
-   }
-
-   return ;
-}
-
-static inline
-void CalcTimeConstraintsForElems() {
-   /* evaluate time constraint */
-   CalcCourantConstraintForElems() ;
-
-   /* check hydro constraint */
-   CalcHydroConstraintForElems() ;
-}
-
-static inline
-void LagrangeLeapFrog()
-{
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal();
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements();
-
-   CalcTimeConstraintsForElems();
-
-   // LagrangeRelease() ;  Creation/destruction of temps may be important to capture 
-}
-
-int main(int argc, char *argv[])
-{
-
-   RAJA::Timer timer_main;
-   RAJA::Timer timer_cycle;
-
-   timer_main.start("timer_main");
-
-   int maxIter = 1024*1024 ;
-   Index_t edgeElems = 45 ;
-
-   for (int i=1; i<argc; ++i) {
-      if (strcmp(argv[i], "-p") == 0) {
-         show_run_progress = 1 ;
-      }
-      else if (strcmp(argv[i], "-i") == 0) {
-         if ((i+1 < argc) && isdigit(argv[i+1][0])) {
-            maxIter = atoi(argv[i+1]) ;
-            ++i;
-         }
-         else  {
-            printf("Iteration (-i) option has bad argument -- ignoring\n") ;
-         }
-      }
-      else if (strcmp(argv[i], "-s") == 0) {
-         if ((i+1 < argc) && isdigit(argv[i+1][0])) {
-            edgeElems = atoi(argv[i+1]) ;
-            ++i;
-         }
-         else  {
-            printf("Size (-s) option has bad argument -- ignoring\n") ;
-         }
-      }
-   }
-
-
-   Index_t edgeNodes = edgeElems+1 ;
-   // Real_t ds = Real_t(1.125)/Real_t(edgeElems) ; /* may accumulate roundoff */
-   Real_t tx, ty, tz ;
-   Index_t nidx, zidx ;
-   Index_t domElems, domNodes ;
-
-   /* get run options to measure various metrics */
-
-   /* ... */
-
-   /****************************/
-   /*   Initialize Sedov Mesh  */
-   /****************************/
-
-   /* construct a uniform box for this processor */
-
-   domain.sizeX()   = edgeElems ;
-   domain.sizeY()   = edgeElems ;
-   domain.sizeZ()   = edgeElems ;
-   domain.numElem() = edgeElems*edgeElems*edgeElems ;
-   domain.numNode() = edgeNodes*edgeNodes*edgeNodes ;
-
-   domElems = domain.numElem() ;
-   domNodes = domain.numNode() ;
-
-   /* allocate field memory */
-
-   domain.AllocateElemPersistent(domain.numElem()) ;
-   domain.AllocateElemTemporary (domain.numElem()) ;
-
-   domain.AllocateNodalPersistent(domain.numNode()) ;
-   domain.AllocateNodesets(edgeNodes*edgeNodes) ;
-
-   /* Basic Field Initialization */
-
-   for (Index_t i=0; i<domElems; ++i) {
-      domain.e(i) = Real_t(0.0) ;
-      domain.p(i) = Real_t(0.0) ;
-      domain.q(i) = Real_t(0.0) ;
-      domain.v(i) = Real_t(1.0) ;
-   }
-
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.xd(i) = Real_t(0.0) ;
-      domain.yd(i) = Real_t(0.0) ;
-      domain.zd(i) = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.xdd(i) = Real_t(0.0) ;
-      domain.ydd(i) = Real_t(0.0) ;
-      domain.zdd(i) = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.nodalMass(i) = Real_t(0.0) ;
-   }
-
-   /* initialize nodal coordinates */
-
-   nidx = 0 ;
-   tz  = Real_t(0.) ;
-   for (Index_t plane=0; plane<edgeNodes; ++plane) {
-      ty = Real_t(0.) ;
-      for (Index_t row=0; row<edgeNodes; ++row) {
-         tx = Real_t(0.) ;
-         for (Index_t col=0; col<edgeNodes; ++col) {
-            domain.x(nidx) = tx ;
-            domain.y(nidx) = ty ;
-            domain.z(nidx) = tz ;
-            ++nidx ;
-            // tx += ds ; /* may accumulate roundoff... */
-            tx = Real_t(1.125)*Real_t(col+1)/Real_t(edgeElems) ;
-         }
-         // ty += ds ;  /* may accumulate roundoff... */
-         ty = Real_t(1.125)*Real_t(row+1)/Real_t(edgeElems) ;
-      }
-      // tz += ds ;  /* may accumulate roundoff... */
-      tz = Real_t(1.125)*Real_t(plane+1)/Real_t(edgeElems) ;
-   }
-
-
-   /* embed hexehedral elements in nodal point lattice */
-
-   nidx = 0 ;
-   zidx = 0 ;
-   for (Index_t plane=0; plane<edgeElems; ++plane) {
-      for (Index_t row=0; row<edgeElems; ++row) {
-         for (Index_t col=0; col<edgeElems; ++col) {
-            Index_t *localNode = domain.nodelist(zidx) ;
-            localNode[0] = nidx                                       ;
-            localNode[1] = nidx                                   + 1 ;
-            localNode[2] = nidx                       + edgeNodes + 1 ;
-            localNode[3] = nidx                       + edgeNodes     ;
-            localNode[4] = nidx + edgeNodes*edgeNodes                 ;
-            localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
-            localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
-            localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
-            ++zidx ;
-            ++nidx ;
-         }
-         ++nidx ;
-      }
-      nidx += edgeNodes ;
-   }
-
-   domain.AllocateNodeElemIndexes() ;
-
-   /* Create a material IndexSet (entire domain same material for now) */
-   for (Index_t i=0; i<domElems; ++i) {
-      domain.matElemlist(i) = i ;
-   }
-   
-   /* initialize material parameters */
-   domain.dtfixed() = Real_t(-1.0e-7) ;
-   domain.deltatime() = Real_t(1.0e-7) ;
-   domain.deltatimemultlb() = Real_t(1.1) ;
-   domain.deltatimemultub() = Real_t(1.2) ;
-   domain.stoptime()  = Real_t(1.0e-2) ;
-   domain.dtcourant() = Real_t(1.0e+20) ;
-   domain.dthydro()   = Real_t(1.0e+20) ;
-   domain.dtmax()     = Real_t(1.0e-2) ;
-   domain.time()    = Real_t(0.) ;
-   domain.cycle()   = 0 ;
-
-   domain.e_cut() = Real_t(1.0e-7) ;
-   domain.p_cut() = Real_t(1.0e-7) ;
-   domain.q_cut() = Real_t(1.0e-7) ;
-   domain.u_cut() = Real_t(1.0e-7) ;
-   domain.v_cut() = Real_t(1.0e-10) ;
-
-   domain.hgcoef()      = Real_t(3.0) ;
-   domain.ss4o3()       = Real_t(4.0)/Real_t(3.0) ;
-
-   domain.qstop()              =  Real_t(1.0e+12) ;
-   domain.monoq_max_slope()    =  Real_t(1.0) ;
-   domain.monoq_limiter_mult() =  Real_t(2.0) ;
-   domain.qlc_monoq()          = Real_t(0.5) ;
-   domain.qqc_monoq()          = Real_t(2.0)/Real_t(3.0) ;
-   domain.qqc()                = Real_t(2.0) ;
-
-   domain.pmin() =  Real_t(0.) ;
-   domain.emin() = Real_t(-1.0e+15) ;
-
-   domain.dvovmax() =  Real_t(0.1) ;
-
-   domain.eosvmax() =  Real_t(1.0e+9) ;
-   domain.eosvmin() =  Real_t(1.0e-9) ;
-
-   domain.refdens() =  Real_t(1.0) ;
-
-   /* initialize field data */
-   for (Index_t i=0; i<domElems; ++i) {
-      Real_t x_local[8], y_local[8], z_local[8] ;
-      Index_t *elemToNode = domain.nodelist(i) ;
-      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-      {
-        Index_t gnode = elemToNode[lnode];
-        x_local[lnode] = domain.x(gnode);
-        y_local[lnode] = domain.y(gnode);
-        z_local[lnode] = domain.z(gnode);
-      }
-
-      // volume calculations
-      Real_t volume = CalcElemVolume(x_local, y_local, z_local );
-      domain.volo(i) = volume ;
-      domain.elemMass(i) = volume ;
-      for (Index_t j=0; j<8; ++j) {
-         Index_t idx = elemToNode[j] ;
-         domain.nodalMass(idx) += volume / Real_t(8.0) ;
-      }
-   }
-
-   /* deposit energy */
-   domain.e(0) = Real_t(3.948746e+7) ;
-
-   /* set up symmetry nodesets */
-   nidx = 0 ;
-   for (Index_t i=0; i<edgeNodes; ++i) {
-      Index_t planeInc = i*edgeNodes*edgeNodes ;
-      Index_t rowInc   = i*edgeNodes ;
-      for (Index_t j=0; j<edgeNodes; ++j) {
-         domain.symmX(nidx) = planeInc + j*edgeNodes ;
-         domain.symmY(nidx) = planeInc + j ;
-         domain.symmZ(nidx) = rowInc   + j ;
-         ++nidx ;
-      }
-   }
-
-   /* set up elemement connectivity information */
-   domain.lxim(0) = 0 ;
-   for (Index_t i=1; i<domElems; ++i) {
-      domain.lxim(i)   = i-1 ;
-      domain.lxip(i-1) = i ;
-   }
-   domain.lxip(domElems-1) = domElems-1 ;
-
-   for (Index_t i=0; i<edgeElems; ++i) {
-      domain.letam(i) = i ; 
-      domain.letap(domElems-edgeElems+i) = domElems-edgeElems+i ;
-   }
-   for (Index_t i=edgeElems; i<domElems; ++i) {
-      domain.letam(i) = i-edgeElems ;
-      domain.letap(i-edgeElems) = i ;
-   }
-
-   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
-      domain.lzetam(i) = i ;
-      domain.lzetap(domElems-edgeElems*edgeElems+i) = domElems-edgeElems*edgeElems+i ;
-   }
-   for (Index_t i=edgeElems*edgeElems; i<domElems; ++i) {
-      domain.lzetam(i) = i - edgeElems*edgeElems ;
-      domain.lzetap(i-edgeElems*edgeElems) = i ;
-   }
-
-   /* set up boundary condition information */
-   for (Index_t i=0; i<domElems; ++i) {
-      domain.elemBC(i) = 0 ;  /* clear BCs by default */
-   }
-
-   /* faces on "external" boundaries will be */
-   /* symmetry plane or free surface BCs */
-   for (Index_t i=0; i<edgeElems; ++i) {
-      Index_t planeInc = i*edgeElems*edgeElems ;
-      Index_t rowInc   = i*edgeElems ;
-      for (Index_t j=0; j<edgeElems; ++j) {
-         domain.elemBC(planeInc+j*edgeElems) |= XI_M_SYMM ;
-         domain.elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_FREE ;
-         domain.elemBC(planeInc+j) |= ETA_M_SYMM ;
-         domain.elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= ETA_P_FREE ;
-         domain.elemBC(rowInc+j) |= ZETA_M_SYMM ;
-         domain.elemBC(rowInc+j+domElems-edgeElems*edgeElems) |= ZETA_P_FREE ;
-      }
-   }
-
-
-   /* timestep to solution */
-   timer_cycle.start("timer_cycle");
-   while((domain.time() < domain.stoptime()) && (domain.cycle() < maxIter)) {
-      TimeIncrement() ;
-      LagrangeLeapFrog() ;
-      /* problem->commNodes->Transfer(CommNodes::syncposvel) ; */
-      if (show_run_progress != 0) {
-         printf("time = %e, dt=%e\n",
-                double(domain.time()), double(domain.deltatime()) ) ;
-      }
-   }
-   timer_cycle.stop("timer_cycle");
-
-   timer_main.stop("timer_main");
-
-   printf("Total Cycle Time (sec) = %Lf\n", timer_cycle.elapsed() );
-   printf("Total main Time (sec) = %Lf\n", timer_main.elapsed() );
-
-
-   return 0 ;
-}
-
diff --git a/test/LULESH-v1.0/LULESH-v1.0_baseline/luleshOMP_NG.cc b/test/LULESH-v1.0/LULESH-v1.0_baseline/luleshOMP_NG.cc
deleted file mode 100644
index 3eaa71ad0..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_baseline/luleshOMP_NG.cc
+++ /dev/null
@@ -1,3142 +0,0 @@
-/*
-
-                 Copyright (c) 2010.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 1.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <vector>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "Timer.hxx"
-
-
-#define LULESH_SHOW_PROGRESS 0
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-/****************************************************/
-/* Allow flexibility for arithmetic representations */
-/****************************************************/
-
-/* Could also support fixed point and interval arithmetic types */
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  /* 10 bytes on x86 */
-
-typedef int    Index_t ; /* array subscript and loop index */
-typedef real8  Real_t ;  /* floating point representation */
-typedef int    Int_t ;   /* integer representation */
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-/************************************************************/
-/* Allow for flexible data layout experiments by separating */
-/* array interface from underlying implementation.          */
-/************************************************************/
-
-struct Domain {
-
-/* This first implementation allows for runnable code */
-/* and is not meant to be optimal. Final implementation */
-/* should separate declaration and allocation phases */
-/* so that allocation can be scheduled in a cache conscious */
-/* manner. */
-
-public:
-
-   /**************/
-   /* Allocation */
-   /**************/
-
-   void AllocateNodalPersistent(size_t size)
-   {
-      m_x.reserve(size) ;
-      m_y.reserve(size) ;
-      m_z.reserve(size) ;
-
-      m_xd.reserve(size) ;
-      m_yd.reserve(size) ;
-      m_zd.reserve(size) ;
-
-      m_xdd.reserve(size) ;
-      m_ydd.reserve(size) ;
-      m_zdd.reserve(size) ;
-
-      m_fx.reserve(size) ;
-      m_fy.reserve(size) ;
-      m_fz.reserve(size) ;
-
-      m_nodalMass.reserve(size) ;
-   }
-
-   void AllocateElemPersistent(size_t size)
-   {
-      m_matElemlist.reserve(size) ;
-      m_nodelist.reserve(8*size) ;
-
-      m_lxim.reserve(size) ;
-      m_lxip.reserve(size) ;
-      m_letam.reserve(size) ;
-      m_letap.reserve(size) ;
-      m_lzetam.reserve(size) ;
-      m_lzetap.reserve(size) ;
-
-      m_elemBC.reserve(size) ;
-
-      m_e.reserve(size) ;
-
-      m_p.reserve(size) ;
-      m_q.reserve(size) ;
-      m_ql.reserve(size) ;
-      m_qq.reserve(size) ;
-
-      m_v.reserve(size) ;
-      m_volo.reserve(size) ;
-      m_delv.reserve(size) ;
-      m_vdov.reserve(size) ;
-
-      m_arealg.reserve(size) ;
-   
-      m_ss.reserve(size) ;
-
-      m_elemMass.reserve(size) ;
-   }
-
-   /* Temporaries should not be initialized in bulk but */
-   /* this is a runnable placeholder for now */
-   void AllocateElemTemporary(size_t size)
-   {
-      m_dxx.reserve(size) ;
-      m_dyy.reserve(size) ;
-      m_dzz.reserve(size) ;
-
-      m_delv_xi.reserve(size) ;
-      m_delv_eta.reserve(size) ;
-      m_delv_zeta.reserve(size) ;
-
-      m_delx_xi.reserve(size) ;
-      m_delx_eta.reserve(size) ;
-      m_delx_zeta.reserve(size) ;
-
-      m_vnew.reserve(size) ;
-   }
-
-   void AllocateNodesets(size_t size)
-   {
-      m_symmX.reserve(size) ;
-      m_symmY.reserve(size) ;
-      m_symmZ.reserve(size) ;
-   }
-
-   void AllocateNodeElemIndexes()
-   {
-       Index_t m;
-       Index_t numElem = this->numElem() ;
-       Index_t numNode = this->numNode() ;
-
-       /* set up node-centered indexing of elements */
-       m_nodeElemCount.reserve(numNode);
-
-       for (Index_t i=0;i<numNode;++i) {
-          nodeElemCount(i)=0;
-       }
-
-       for (Index_t i=0; i<numElem; ++i) {
-          Index_t *nl = nodelist(i) ;
-          for (Index_t j=0; j < 8; ++j) {
-             ++nodeElemCount(nl[j]);
-          }
-       }
-
-       m_nodeElemStart.reserve(numNode);
-
-       nodeElemStart(0)=0;
-
-       for (Index_t i=1; i < numNode; ++i) {
-          nodeElemStart(i) = nodeElemStart(i-1) + nodeElemCount(i-1) ;
-       }
-
-//       m_nodeElemList.reserve(nodeElemStart(numNode-1) +
-//                             nodeElemCount(numNode-1));
-
-       m_nodeElemCornerList.reserve(nodeElemStart(numNode-1) +
-                                   nodeElemCount(numNode-1));
-
-       for (Index_t i=0; i < numNode; ++i) {
-          nodeElemCount(i)=0;
-       }
-
-       for (Index_t i=0; i < numElem; ++i) {
-          Index_t *nl = nodelist(i) ;
-          for (Index_t j=0; j < 8; ++j) {
-             Index_t m = nl[j];
-             Index_t k = i*8 + j ;
-             Index_t offset = nodeElemStart(m)+nodeElemCount(m) ;
-//             nodeElemList(offset) = i;
-             nodeElemCornerList(offset) = k;
-             ++nodeElemCount(m);
-          }
-       }
-
-       Index_t clSize = m_nodeElemCornerList.size() ;
-       for (Index_t i=0; i < clSize; ++i) {
-          Index_t clv = nodeElemCornerList(i) ;
-          if ((clv < 0) || (clv > numElem*8)) {
-               fprintf(stderr,
-        "AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n");
-               exit(1);
-          }
-      }
-   }
-
-   
-   /**********/
-   /* Access */
-   /**********/
-
-   /* Node-centered */
-
-   Real_t& x(Index_t idx)    { return m_x[idx] ; }
-   Real_t& y(Index_t idx)    { return m_y[idx] ; }
-   Real_t& z(Index_t idx)    { return m_z[idx] ; }
-
-   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
-   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
-   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
-
-   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
-   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
-   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
-
-   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
-   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
-   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
-
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   Index_t& symmX(Index_t idx) { return m_symmX[idx] ; }
-   Index_t& symmY(Index_t idx) { return m_symmY[idx] ; }
-   Index_t& symmZ(Index_t idx) { return m_symmZ[idx] ; }
-
-   Index_t& nodeElemCount(Index_t idx) { return m_nodeElemCount[idx] ; }
-   Index_t& nodeElemStart(Index_t idx) { return m_nodeElemStart[idx] ; }
-//   Index_t& nodeElemList(Index_t idx)  { return m_nodeElemList[idx] ; }
-   Index_t& nodeElemCornerList(Index_t i) { return m_nodeElemCornerList[i] ; }
-
-   /* Element-centered */
-
-   Index_t&  matElemlist(Index_t idx) { return m_matElemlist[idx] ; }
-   Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
-
-   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
-   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
-   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
-   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
-   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
-   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
-
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   Real_t& p(Index_t idx)          { return m_p[idx] ; }
-   Real_t& q(Index_t idx)          { return m_q[idx] ; }
-   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
-   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
-
-   Real_t& v(Index_t idx)          { return m_v[idx] ; }
-   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
-   Real_t& vnew(Index_t idx)       { return m_vnew[idx] ; }
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-   
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   /* Params */
-
-   Real_t& dtfixed()              { return m_dtfixed ; }
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-
-   Real_t& u_cut()                { return m_u_cut ; }
-   Real_t& hgcoef()               { return m_hgcoef ; }
-   Real_t& qstop()                { return m_qstop ; }
-   Real_t& monoq_max_slope()      { return m_monoq_max_slope ; }
-   Real_t& monoq_limiter_mult()   { return m_monoq_limiter_mult ; }
-   Real_t& e_cut()                { return m_e_cut ; }
-   Real_t& p_cut()                { return m_p_cut ; }
-   Real_t& ss4o3()                { return m_ss4o3 ; }
-   Real_t& q_cut()                { return m_q_cut ; }
-   Real_t& v_cut()                { return m_v_cut ; }
-   Real_t& qlc_monoq()            { return m_qlc_monoq ; }
-   Real_t& qqc_monoq()            { return m_qqc_monoq ; }
-   Real_t& qqc()                  { return m_qqc ; }
-   Real_t& eosvmax()              { return m_eosvmax ; }
-   Real_t& eosvmin()              { return m_eosvmin ; }
-   Real_t& pmin()                 { return m_pmin ; }
-   Real_t& emin()                 { return m_emin ; }
-   Real_t& dvovmax()              { return m_dvovmax ; }
-   Real_t& refdens()              { return m_refdens ; }
-
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-
-private:
-
-   /******************/
-   /* Implementation */
-   /******************/
-
-   /* Node-centered */
-
-   std::vector<Real_t> m_x ;  /* coordinates */
-   std::vector<Real_t> m_y ;
-   std::vector<Real_t> m_z ;
-
-   std::vector<Real_t> m_xd ; /* velocities */
-   std::vector<Real_t> m_yd ;
-   std::vector<Real_t> m_zd ;
-
-   std::vector<Real_t> m_xdd ; /* accelerations */
-   std::vector<Real_t> m_ydd ;
-   std::vector<Real_t> m_zdd ;
-
-   std::vector<Real_t> m_fx ;  /* forces */
-   std::vector<Real_t> m_fy ;
-   std::vector<Real_t> m_fz ;
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
-   std::vector<Index_t> m_symmY ;
-   std::vector<Index_t> m_symmZ ;
-
-   std::vector<Index_t> m_nodeElemCount ;
-   std::vector<Index_t> m_nodeElemStart ;
-//   std::vector<Index_t> m_nodeElemList ;
-   std::vector<Index_t> m_nodeElemCornerList ;
-
-   /* Element-centered */
-
-   std::vector<Index_t>  m_matElemlist ;  /* material indexset */
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */
-   std::vector<Index_t>  m_lxip ;
-   std::vector<Index_t>  m_letam ;
-   std::vector<Index_t>  m_letap ;
-   std::vector<Index_t>  m_lzetam ;
-   std::vector<Index_t>  m_lzetap ;
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   std::vector<Real_t> m_p ;   /* pressure */
-   std::vector<Real_t> m_q ;   /* q */
-   std::vector<Real_t> m_ql ;  /* linear term for q */
-   std::vector<Real_t> m_qq ;  /* quadratic term for q */
-
-   std::vector<Real_t> m_v ;     /* relative volume */
-   std::vector<Real_t> m_volo ;  /* reference volume */
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   /* Parameters */
-
-   Real_t  m_dtfixed ;           /* fixed time increment */
-   Real_t  m_time ;              /* current time */
-   Real_t  m_deltatime ;         /* variable time increment */
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_stoptime ;          /* end time for simulation */
-
-   Real_t  m_u_cut ;             /* velocity tolerance */
-   Real_t  m_hgcoef ;            /* hourglass control */
-   Real_t  m_qstop ;             /* excessive q indicator */
-   Real_t  m_monoq_max_slope ;
-   Real_t  m_monoq_limiter_mult ;
-   Real_t  m_e_cut ;             /* energy tolerance */
-   Real_t  m_p_cut ;             /* pressure tolerance */
-   Real_t  m_ss4o3 ;
-   Real_t  m_q_cut ;             /* q tolerance */
-   Real_t  m_v_cut ;             /* relative volume tolerance */
-   Real_t  m_qlc_monoq ;         /* linear term coef for q */
-   Real_t  m_qqc_monoq ;         /* quadratic term coef for q */
-   Real_t  m_qqc ;
-   Real_t  m_eosvmax ;
-   Real_t  m_eosvmin ;
-   Real_t  m_pmin ;              /* pressure floor */
-   Real_t  m_emin ;              /* energy floor */
-   Real_t  m_dvovmax ;           /* maximum allowable volume change */
-   Real_t  m_refdens ;           /* reference density */
-
-   Real_t  m_dtcourant ;         /* courant constraint */
-   Real_t  m_dthydro ;           /* volume change constraint */
-   Real_t  m_dtmax ;             /* maximum allowable time increment */
-
-   Int_t   m_cycle ;             /* iteration count for simulation */
-
-   Index_t   m_sizeX ;           /* X,Y,Z extent of this block */
-   Index_t   m_sizeY ;
-   Index_t   m_sizeZ ;
-
-   Index_t   m_numElem ;         /* Elements/Nodes in this domain */
-   Index_t   m_numNode ;
-} domain ;
-
-
-template <typename T>
-T *Allocate(size_t size)
-{
-   return static_cast<T *>(malloc(sizeof(T)*size)) ;
-}
-
-template <typename T>
-void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-
-/* Stuff needed for boundary conditions */
-/* 2 BCs on each of 6 hexahedral faces (12 bits) */
-#define XI_M        0x003
-#define XI_M_SYMM   0x001
-#define XI_M_FREE   0x002
-
-#define XI_P        0x00c
-#define XI_P_SYMM   0x004
-#define XI_P_FREE   0x008
-
-#define ETA_M       0x030
-#define ETA_M_SYMM  0x010
-#define ETA_M_FREE  0x020
-
-#define ETA_P       0x0c0
-#define ETA_P_SYMM  0x040
-#define ETA_P_FREE  0x080
-
-#define ZETA_M      0x300
-#define ZETA_M_SYMM 0x100
-#define ZETA_M_FREE 0x200
-
-#define ZETA_P      0xc00
-#define ZETA_P_SYMM 0x400
-#define ZETA_P_FREE 0x800
-
-
-static inline
-void TimeIncrement()
-{
-   Real_t targetdt = domain.stoptime() - domain.time() ;
-
-   if ((domain.dtfixed() <= Real_t(0.0)) && (domain.cycle() != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = domain.deltatime() ;
-
-      /* This will require a reduction in parallel */
-      Real_t newdt = Real_t(1.0e+20) ;
-      if (domain.dtcourant() < newdt) {
-         newdt = domain.dtcourant() / Real_t(2.0) ;
-      }
-      if (domain.dthydro() < newdt) {
-         newdt = domain.dthydro() * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < domain.deltatimemultlb()) {
-            newdt = olddt ;
-         }
-         else if (ratio > domain.deltatimemultub()) {
-            newdt = olddt*domain.deltatimemultub() ;
-         }
-      }
-
-      if (newdt > domain.dtmax()) {
-         newdt = domain.dtmax() ;
-      }
-      domain.deltatime() = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > domain.deltatime()) &&
-       (targetdt < (Real_t(4.0) * domain.deltatime() / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * domain.deltatime() / Real_t(3.0) ;
-   }
-
-   if (targetdt < domain.deltatime()) {
-      domain.deltatime() = targetdt ;
-   }
-
-   domain.time() += domain.deltatime() ;
-
-   ++domain.cycle() ;
-}
-
-static inline
-void InitStressTermsForElems(Index_t numElem, 
-                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-#pragma omp parallel for firstprivate(numElem)
-   for (Index_t i = 0 ; i < numElem ; ++i){
-      sigxx[i] =  sigyy[i] = sigzz[i] =  - domain.p(i) - domain.q(i) ;
-   }
-}
-
-static inline
-void CalcElemShapeFunctionDerivatives( const Real_t* const x,
-                                       const Real_t* const y,
-                                       const Real_t* const z,
-                                       Real_t b[][8],
-                                       Real_t* const volume )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-static inline
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-static inline
-void CalcElemNodeNormals(Real_t pfx[8],
-                         Real_t pfy[8],
-                         Real_t pfz[8],
-                         const Real_t x[8],
-                         const Real_t y[8],
-                         const Real_t z[8])
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-static inline
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_t* const fx,
-                                  Real_t* const fy,
-                                  Real_t* const fz )
-{
-  Real_t pfx0 = B[0][0] ;   Real_t pfx1 = B[0][1] ;
-  Real_t pfx2 = B[0][2] ;   Real_t pfx3 = B[0][3] ;
-  Real_t pfx4 = B[0][4] ;   Real_t pfx5 = B[0][5] ;
-  Real_t pfx6 = B[0][6] ;   Real_t pfx7 = B[0][7] ;
-
-  Real_t pfy0 = B[1][0] ;   Real_t pfy1 = B[1][1] ;
-  Real_t pfy2 = B[1][2] ;   Real_t pfy3 = B[1][3] ;
-  Real_t pfy4 = B[1][4] ;   Real_t pfy5 = B[1][5] ;
-  Real_t pfy6 = B[1][6] ;   Real_t pfy7 = B[1][7] ;
-
-  Real_t pfz0 = B[2][0] ;   Real_t pfz1 = B[2][1] ;
-  Real_t pfz2 = B[2][2] ;   Real_t pfz3 = B[2][3] ;
-  Real_t pfz4 = B[2][4] ;   Real_t pfz5 = B[2][5] ;
-  Real_t pfz6 = B[2][6] ;   Real_t pfz7 = B[2][7] ;
-
-  fx[0] = -( stress_xx * pfx0 );
-  fx[1] = -( stress_xx * pfx1 );
-  fx[2] = -( stress_xx * pfx2 );
-  fx[3] = -( stress_xx * pfx3 );
-  fx[4] = -( stress_xx * pfx4 );
-  fx[5] = -( stress_xx * pfx5 );
-  fx[6] = -( stress_xx * pfx6 );
-  fx[7] = -( stress_xx * pfx7 );
-
-  fy[0] = -( stress_yy * pfy0  );
-  fy[1] = -( stress_yy * pfy1  );
-  fy[2] = -( stress_yy * pfy2  );
-  fy[3] = -( stress_yy * pfy3  );
-  fy[4] = -( stress_yy * pfy4  );
-  fy[5] = -( stress_yy * pfy5  );
-  fy[6] = -( stress_yy * pfy6  );
-  fy[7] = -( stress_yy * pfy7  );
-
-  fz[0] = -( stress_zz * pfz0 );
-  fz[1] = -( stress_zz * pfz1 );
-  fz[2] = -( stress_zz * pfz2 );
-  fz[3] = -( stress_zz * pfz3 );
-  fz[4] = -( stress_zz * pfz4 );
-  fz[5] = -( stress_zz * pfz5 );
-  fz[6] = -( stress_zz * pfz6 );
-  fz[7] = -( stress_zz * pfz7 );
-}
-
-static inline
-void IntegrateStressForElems( Index_t numElem,
-                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                              Real_t *determ)
-{
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *fx_elem = Allocate<Real_t>(numElem8) ;
-   Real_t *fy_elem = Allocate<Real_t>(numElem8) ;
-   Real_t *fz_elem = Allocate<Real_t>(numElem8) ;
-
-  // loop over all elements
-#pragma omp parallel for firstprivate(numElem)
-  for( Index_t k=0 ; k<numElem ; ++k )
-  {
-    Real_t B[3][8] ;// shape function derivatives
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-
-    const Index_t* const elemNodes = domain.nodelist(k);
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      x_local[lnode] = domain.x(gnode);
-      y_local[lnode] = domain.y(gnode);
-      z_local[lnode] = domain.z(gnode);
-    }
-
-    /* Volume calculation involves extra work for numerical consistency. */
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                         B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                          x_local, y_local, z_local );
-
-    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                 &fx_elem[k*8], &fy_elem[k*8], &fz_elem[k*8] ) ;
-
-#if 0
-    // copy nodal force contributions to global force arrray.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemNodes[lnode];
-      domain.fx(gnode) += fx_local[lnode];
-      domain.fy(gnode) += fy_local[lnode];
-      domain.fz(gnode) += fz_local[lnode];
-    }
-#endif
-  }
-
-  {
-     Index_t numNode = domain.numNode() ;
-
-#pragma omp parallel for firstprivate(numNode)
-     for( Index_t gnode=0 ; gnode<numNode ; ++gnode )
-     {
-        Index_t count = domain.nodeElemCount(gnode) ;
-        Index_t start = domain.nodeElemStart(gnode) ;
-        Real_t fx = Real_t(0.0) ;
-        Real_t fy = Real_t(0.0) ;
-        Real_t fz = Real_t(0.0) ;
-        for (Index_t i=0 ; i < count ; ++i) {
-           Index_t elem = domain.nodeElemCornerList(start+i) ;
-           fx += fx_elem[elem] ;
-           fy += fy_elem[elem] ;
-           fz += fz_elem[elem] ;
-        }
-        domain.fx(gnode) = fx ;
-        domain.fy(gnode) = fy ;
-        domain.fz(gnode) = fz ;
-     }
-  }
-
-  Release(&fz_elem) ;
-  Release(&fy_elem) ;
-  Release(&fx_elem) ;
-}
-
-
-static inline
-void CollectDomainNodesToElemNodes(const Index_t* elemToNode,
-                                   Real_t elemX[8],
-                                   Real_t elemY[8],
-                                   Real_t elemZ[8])
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = domain.x(nd0i);
-   elemX[1] = domain.x(nd1i);
-   elemX[2] = domain.x(nd2i);
-   elemX[3] = domain.x(nd3i);
-   elemX[4] = domain.x(nd4i);
-   elemX[5] = domain.x(nd5i);
-   elemX[6] = domain.x(nd6i);
-   elemX[7] = domain.x(nd7i);
-
-   elemY[0] = domain.y(nd0i);
-   elemY[1] = domain.y(nd1i);
-   elemY[2] = domain.y(nd2i);
-   elemY[3] = domain.y(nd3i);
-   elemY[4] = domain.y(nd4i);
-   elemY[5] = domain.y(nd5i);
-   elemY[6] = domain.y(nd6i);
-   elemY[7] = domain.y(nd7i);
-
-   elemZ[0] = domain.z(nd0i);
-   elemZ[1] = domain.z(nd1i);
-   elemZ[2] = domain.z(nd2i);
-   elemZ[3] = domain.z(nd3i);
-   elemZ[4] = domain.z(nd4i);
-   elemZ[5] = domain.z(nd5i);
-   elemZ[6] = domain.z(nd6i);
-   elemZ[7] = domain.z(nd7i);
-
-}
-
-static inline
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-static inline
-void CalcElemVolumeDerivative(Real_t dvdx[8],
-                              Real_t dvdy[8],
-                              Real_t dvdz[8],
-                              const Real_t x[8],
-                              const Real_t y[8],
-                              const Real_t z[8])
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-static inline
-void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t *hourgam0,
-                              Real_t *hourgam1, Real_t *hourgam2, Real_t *hourgam3,
-                              Real_t *hourgam4, Real_t *hourgam5, Real_t *hourgam6,
-                              Real_t *hourgam7, Real_t coefficient,
-                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
-{
-   Index_t i00=0;
-   Index_t i01=1;
-   Index_t i02=2;
-   Index_t i03=3;
-
-   Real_t h00 =
-      hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] +
-      hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] +
-      hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] +
-      hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7];
-
-   Real_t h01 =
-      hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] +
-      hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] +
-      hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] +
-      hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7];
-
-   Real_t h02 =
-      hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+
-      hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+
-      hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+
-      hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7];
-
-   Real_t h03 =
-      hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] +
-      hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] +
-      hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] +
-      hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7];
-
-   hgfx[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfx[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfx[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfx[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfx[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfx[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfx[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfx[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] +
-      hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] +
-      hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] +
-      hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7];
-
-   h01 =
-      hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] +
-      hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] +
-      hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] +
-      hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7];
-
-   h02 =
-      hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+
-      hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+
-      hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+
-      hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7];
-
-   h03 =
-      hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] +
-      hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] +
-      hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] +
-      hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7];
-
-
-   hgfy[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfy[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfy[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfy[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfy[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfy[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfy[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfy[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-
-   h00 =
-      hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] +
-      hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] +
-      hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] +
-      hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7];
-
-   h01 =
-      hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] +
-      hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] +
-      hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] +
-      hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7];
-
-   h02 =
-      hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+
-      hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+
-      hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+
-      hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7];
-
-   h03 =
-      hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] +
-      hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] +
-      hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] +
-      hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7];
-
-
-   hgfz[0] = coefficient *
-      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +
-       hourgam0[i02] * h02 + hourgam0[i03] * h03);
-
-   hgfz[1] = coefficient *
-      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +
-       hourgam1[i02] * h02 + hourgam1[i03] * h03);
-
-   hgfz[2] = coefficient *
-      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +
-       hourgam2[i02] * h02 + hourgam2[i03] * h03);
-
-   hgfz[3] = coefficient *
-      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +
-       hourgam3[i02] * h02 + hourgam3[i03] * h03);
-
-   hgfz[4] = coefficient *
-      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +
-       hourgam4[i02] * h02 + hourgam4[i03] * h03);
-
-   hgfz[5] = coefficient *
-      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +
-       hourgam5[i02] * h02 + hourgam5[i03] * h03);
-
-   hgfz[6] = coefficient *
-      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +
-       hourgam6[i02] * h02 + hourgam6[i03] * h03);
-
-   hgfz[7] = coefficient *
-      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +
-       hourgam7[i02] * h02 + hourgam7[i03] * h03);
-}
-
-static inline
-void CalcFBHourglassForceForElems(Real_t *determ,
-            Real_t *x8n,      Real_t *y8n,      Real_t *z8n,
-            Real_t *dvdx,     Real_t *dvdy,     Real_t *dvdz,
-            Real_t hourg)
-{
-   /*************************************************
-    *
-    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-    *               force.
-    *
-    *************************************************/
-
-   Index_t numElem = domain.numElem() ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *fx_elem = Allocate<Real_t>(numElem8) ;
-   Real_t *fy_elem = Allocate<Real_t>(numElem8) ;
-   Real_t *fz_elem = Allocate<Real_t>(numElem8) ;
-
-   Real_t  gamma[4][8];
-
-   gamma[0][0] = Real_t( 1.);
-   gamma[0][1] = Real_t( 1.);
-   gamma[0][2] = Real_t(-1.);
-   gamma[0][3] = Real_t(-1.);
-   gamma[0][4] = Real_t(-1.);
-   gamma[0][5] = Real_t(-1.);
-   gamma[0][6] = Real_t( 1.);
-   gamma[0][7] = Real_t( 1.);
-   gamma[1][0] = Real_t( 1.);
-   gamma[1][1] = Real_t(-1.);
-   gamma[1][2] = Real_t(-1.);
-   gamma[1][3] = Real_t( 1.);
-   gamma[1][4] = Real_t(-1.);
-   gamma[1][5] = Real_t( 1.);
-   gamma[1][6] = Real_t( 1.);
-   gamma[1][7] = Real_t(-1.);
-   gamma[2][0] = Real_t( 1.);
-   gamma[2][1] = Real_t(-1.);
-   gamma[2][2] = Real_t( 1.);
-   gamma[2][3] = Real_t(-1.);
-   gamma[2][4] = Real_t( 1.);
-   gamma[2][5] = Real_t(-1.);
-   gamma[2][6] = Real_t( 1.);
-   gamma[2][7] = Real_t(-1.);
-   gamma[3][0] = Real_t(-1.);
-   gamma[3][1] = Real_t( 1.);
-   gamma[3][2] = Real_t(-1.);
-   gamma[3][3] = Real_t( 1.);
-   gamma[3][4] = Real_t( 1.);
-   gamma[3][5] = Real_t(-1.);
-   gamma[3][6] = Real_t( 1.);
-   gamma[3][7] = Real_t(-1.);
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-
-#pragma omp parallel for firstprivate(numElem, hourg) 
-   for(Index_t i2=0; i2<numElem; ++i2){
-      Real_t *fx_local, *fy_local, *fz_local ;
-      Real_t hgfx[8], hgfy[8], hgfz[8] ;
-
-      Real_t coefficient;
-
-      Real_t hourgam0[4], hourgam1[4], hourgam2[4], hourgam3[4] ;
-      Real_t hourgam4[4], hourgam5[4], hourgam6[4], hourgam7[4];
-      Real_t xd1[8], yd1[8], zd1[8] ;
-
-      const Index_t *elemToNode = domain.nodelist(i2);
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
-            x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
-            x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
-            x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
-            y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
-            y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
-            y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
-            z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
-            z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
-            z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];
-
-         hourgam0[i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam1[i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam2[i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam3[i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam4[i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam5[i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam6[i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam7[i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=domain.ss(i2);
-      mass1=domain.elemMass(i2);
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = domain.xd(n0si2);
-      xd1[1] = domain.xd(n1si2);
-      xd1[2] = domain.xd(n2si2);
-      xd1[3] = domain.xd(n3si2);
-      xd1[4] = domain.xd(n4si2);
-      xd1[5] = domain.xd(n5si2);
-      xd1[6] = domain.xd(n6si2);
-      xd1[7] = domain.xd(n7si2);
-
-      yd1[0] = domain.yd(n0si2);
-      yd1[1] = domain.yd(n1si2);
-      yd1[2] = domain.yd(n2si2);
-      yd1[3] = domain.yd(n3si2);
-      yd1[4] = domain.yd(n4si2);
-      yd1[5] = domain.yd(n5si2);
-      yd1[6] = domain.yd(n6si2);
-      yd1[7] = domain.yd(n7si2);
-
-      zd1[0] = domain.zd(n0si2);
-      zd1[1] = domain.zd(n1si2);
-      zd1[2] = domain.zd(n2si2);
-      zd1[3] = domain.zd(n3si2);
-      zd1[4] = domain.zd(n4si2);
-      zd1[5] = domain.zd(n5si2);
-      zd1[6] = domain.zd(n6si2);
-      zd1[7] = domain.zd(n7si2);
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1,
-                      hourgam0,hourgam1,hourgam2,hourgam3,
-                      hourgam4,hourgam5,hourgam6,hourgam7,
-                      coefficient, hgfx, hgfy, hgfz);
-
-      fx_local = &fx_elem[i3] ;
-      fx_local[0] = hgfx[0];
-      fx_local[1] = hgfx[1];
-      fx_local[2] = hgfx[2];
-      fx_local[3] = hgfx[3];
-      fx_local[4] = hgfx[4];
-      fx_local[5] = hgfx[5];
-      fx_local[6] = hgfx[6];
-      fx_local[7] = hgfx[7];
-
-      fy_local = &fy_elem[i3] ;
-      fy_local[0] = hgfy[0];
-      fy_local[1] = hgfy[1];
-      fy_local[2] = hgfy[2];
-      fy_local[3] = hgfy[3];
-      fy_local[4] = hgfy[4];
-      fy_local[5] = hgfy[5];
-      fy_local[6] = hgfy[6];
-      fy_local[7] = hgfy[7];
-
-      fz_local = &fz_elem[i3] ;
-      fz_local[0] = hgfz[0];
-      fz_local[1] = hgfz[1];
-      fz_local[2] = hgfz[2];
-      fz_local[3] = hgfz[3];
-      fz_local[4] = hgfz[4];
-      fz_local[5] = hgfz[5];
-      fz_local[6] = hgfz[6];
-      fz_local[7] = hgfz[7];
-
-#if 0
-      domain.fx(n0si2) += hgfx[0];
-      domain.fy(n0si2) += hgfy[0];
-      domain.fz(n0si2) += hgfz[0];
-
-      domain.fx(n1si2) += hgfx[1];
-      domain.fy(n1si2) += hgfy[1];
-      domain.fz(n1si2) += hgfz[1];
-
-      domain.fx(n2si2) += hgfx[2];
-      domain.fy(n2si2) += hgfy[2];
-      domain.fz(n2si2) += hgfz[2];
-
-      domain.fx(n3si2) += hgfx[3];
-      domain.fy(n3si2) += hgfy[3];
-      domain.fz(n3si2) += hgfz[3];
-
-      domain.fx(n4si2) += hgfx[4];
-      domain.fy(n4si2) += hgfy[4];
-      domain.fz(n4si2) += hgfz[4];
-
-      domain.fx(n5si2) += hgfx[5];
-      domain.fy(n5si2) += hgfy[5];
-      domain.fz(n5si2) += hgfz[5];
-
-      domain.fx(n6si2) += hgfx[6];
-      domain.fy(n6si2) += hgfy[6];
-      domain.fz(n6si2) += hgfz[6];
-
-      domain.fx(n7si2) += hgfx[7];
-      domain.fy(n7si2) += hgfy[7];
-      domain.fz(n7si2) += hgfz[7];
-#endif
-   }
-
-  {
-     Index_t numNode = domain.numNode() ;
-
-#pragma omp parallel for firstprivate(numNode)
-     for( Index_t gnode=0 ; gnode<numNode ; ++gnode )
-     {
-        Index_t count = domain.nodeElemCount(gnode) ;
-        Index_t start = domain.nodeElemStart(gnode) ;
-        Real_t fx = Real_t(0.0) ;
-        Real_t fy = Real_t(0.0) ;
-        Real_t fz = Real_t(0.0) ;
-        for (Index_t i=0 ; i < count ; ++i) {
-           Index_t elem = domain.nodeElemCornerList(start+i) ;
-           fx += fx_elem[elem] ;
-           fy += fy_elem[elem] ;
-           fz += fz_elem[elem] ;
-        }
-        domain.fx(gnode) += fx ;
-        domain.fy(gnode) += fy ;
-        domain.fz(gnode) += fz ;
-     }
-  }
-
-  Release(&fz_elem) ;
-  Release(&fy_elem) ;
-  Release(&fx_elem) ;
-}
-
-static inline
-void CalcHourglassControlForElems(Real_t determ[], Real_t hgcoef)
-{
-   Index_t numElem = domain.numElem() ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *dvdx = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdy = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdz = Allocate<Real_t>(numElem8) ;
-   Real_t *x8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *y8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *z8n  = Allocate<Real_t>(numElem8) ;
-
-   /* start loop over elements */
-#pragma omp parallel for firstprivate(numElem)
-   for (Index_t i=0 ; i<numElem ; ++i){
-      Real_t  x1[8],  y1[8],  z1[8] ;
-      Real_t pfx[8], pfy[8], pfz[8] ;
-
-      Index_t* elemToNode = domain.nodelist(i);
-      CollectDomainNodesToElemNodes(elemToNode, x1, y1, z1);
-
-      CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
-
-      /* load into temporary storage for FB Hour Glass control */
-      for(Index_t ii=0;ii<8;++ii){
-         Index_t jj=8*i+ii;
-
-         dvdx[jj] = pfx[ii];
-         dvdy[jj] = pfy[ii];
-         dvdz[jj] = pfz[ii];
-         x8n[jj]  = x1[ii];
-         y8n[jj]  = y1[ii];
-         z8n[jj]  = z1[ii];
-      }
-
-      determ[i] = domain.volo(i) * domain.v(i);
-
-      /* Do a check for negative volumes */
-      if ( domain.v(i) <= Real_t(0.0) ) {
-         exit(VolumeError) ;
-      }
-   }
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems(determ,x8n,y8n,z8n,dvdx,dvdy,dvdz,hgcoef) ;
-   }
-
-   Release(&z8n) ;
-   Release(&y8n) ;
-   Release(&x8n) ;
-   Release(&dvdz) ;
-   Release(&dvdy) ;
-   Release(&dvdx) ;
-
-   return ;
-}
-
-static inline
-void CalcVolumeForceForElems()
-{
-   Index_t numElem = domain.numElem() ;
-   if (numElem != 0) {
-      Real_t  hgcoef = domain.hgcoef() ;
-      Real_t *sigxx  = Allocate<Real_t>(numElem) ;
-      Real_t *sigyy  = Allocate<Real_t>(numElem) ;
-      Real_t *sigzz  = Allocate<Real_t>(numElem) ;
-      Real_t *determ = Allocate<Real_t>(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(numElem, sigxx, sigyy, sigzz);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( numElem, sigxx, sigyy, sigzz, determ) ;
-
-      // check for negative element volume
-#pragma omp parallel for firstprivate(numElem)
-      for ( Index_t k=0 ; k<numElem ; ++k ) {
-         if (determ[k] <= Real_t(0.0)) {
-            exit(VolumeError) ;
-         }
-      }
-
-      CalcHourglassControlForElems(determ, hgcoef) ;
-
-      Release(&determ) ;
-      Release(&sigzz) ;
-      Release(&sigyy) ;
-      Release(&sigxx) ;
-   }
-}
-
-static inline void CalcForceForNodes()
-{
-  Index_t numNode = domain.numNode() ;
-#pragma omp parallel for firstprivate(numNode)
-  for (Index_t i=0; i<numNode; ++i) {
-     domain.fx(i) = Real_t(0.0) ;
-     domain.fy(i) = Real_t(0.0) ;
-     domain.fz(i) = Real_t(0.0) ;
-  }
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems() ;
-
-  /* Calculate Nodal Forces at domain boundaries */
-  /* problem->commSBN->Transfer(CommSBN::forces); */
-
-}
-
-static inline
-void CalcAccelerationForNodes()
-{
-   Index_t numNode = domain.numNode() ;
-#pragma omp parallel for firstprivate(numNode)
-   for (Index_t i = 0; i < numNode; ++i) {
-      domain.xdd(i) = domain.fx(i) / domain.nodalMass(i);
-      domain.ydd(i) = domain.fy(i) / domain.nodalMass(i);
-      domain.zdd(i) = domain.fz(i) / domain.nodalMass(i);
-   }
-}
-
-
-static inline
-void ApplyAccelerationBoundaryConditionsForNodes()
-{
-  Index_t numNodeBC = (domain.sizeX()+1)*(domain.sizeX()+1) ;
- 
-#pragma omp parallel
-{
-#pragma omp for nowait firstprivate(numNodeBC)
-  for(Index_t i=0 ; i<numNodeBC ; ++i)
-    domain.xdd(domain.symmX(i)) = Real_t(0.0) ;
-
-#pragma omp for nowait firstprivate(numNodeBC)
-  for(Index_t i=0 ; i<numNodeBC ; ++i)
-    domain.ydd(domain.symmY(i)) = Real_t(0.0) ;
-
-#pragma omp for firstprivate(numNodeBC)
-  for(Index_t i=0 ; i<numNodeBC ; ++i)
-    domain.zdd(domain.symmZ(i)) = Real_t(0.0) ;
-}
-}
-
-static inline
-void CalcVelocityForNodes(const Real_t dt, const Real_t u_cut)
-{
-   Index_t numNode = domain.numNode() ;
-
-#pragma omp parallel for firstprivate(numNode)
-   for ( Index_t i = 0 ; i < numNode ; ++i )
-   {
-     Real_t xdtmp, ydtmp, zdtmp ;
-
-     xdtmp = domain.xd(i) + domain.xdd(i) * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     domain.xd(i) = xdtmp ;
-
-     ydtmp = domain.yd(i) + domain.ydd(i) * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     domain.yd(i) = ydtmp ;
-
-     zdtmp = domain.zd(i) + domain.zdd(i) * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     domain.zd(i) = zdtmp ;
-   }
-}
-
-static inline
-void CalcPositionForNodes(const Real_t dt)
-{
-   Index_t numNode = domain.numNode() ;
-
-#pragma omp parallel for firstprivate(numNode)
-   for ( Index_t i = 0 ; i < numNode ; ++i )
-   {
-     domain.x(i) += domain.xd(i) * dt ;
-     domain.y(i) += domain.yd(i) * dt ;
-     domain.z(i) += domain.zd(i) * dt ;
-   }
-}
-
-static inline
-void LagrangeNodal()
-{
-  const Real_t delt = domain.deltatime() ;
-  Real_t u_cut = domain.u_cut() ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes();
-
-  CalcAccelerationForNodes();
-
-  ApplyAccelerationBoundaryConditionsForNodes();
-
-  CalcVelocityForNodes( delt, u_cut ) ;
-
-  CalcPositionForNodes( delt );
-
-  return;
-}
-
-static inline
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-static inline
-Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-static inline
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-static inline
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-static inline
-void CalcElemVelocityGradient( const Real_t* const xvel,
-                               const Real_t* const yvel,
-                               const Real_t* const zvel,
-                               const Real_t b[][8],
-                               const Real_t detJ,
-                               Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-static inline
-void CalcKinematicsForElems( Index_t numElem, Real_t dt )
-{
-  // loop over all elements
-#pragma omp parallel for firstprivate(numElem, dt)
-  for( Index_t k=0 ; k<numElem ; ++k )
-  {
-     Real_t B[3][8] ; /** shape function derivatives */
-     Real_t D[6] ;
-     Real_t x_local[8] ;
-     Real_t y_local[8] ;
-     Real_t z_local[8] ;
-     Real_t xd_local[8] ;
-     Real_t yd_local[8] ;
-     Real_t zd_local[8] ;
-     Real_t detJ = Real_t(0.0) ;
-
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_t* const elemToNode = domain.nodelist(k) ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      x_local[lnode] = domain.x(gnode);
-      y_local[lnode] = domain.y(gnode);
-      z_local[lnode] = domain.z(gnode);
-    }
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / domain.volo(k) ;
-    domain.vnew(k) = relativeVolume ;
-    domain.delv(k) = relativeVolume - domain.v(k) ;
-
-    // set characteristic length
-    domain.arealg(k) = CalcElemCharacteristicLength(x_local,
-                                                  y_local,
-                                                  z_local,
-                                                  volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = domain.xd(gnode);
-      yd_local[lnode] = domain.yd(gnode);
-      zd_local[lnode] = domain.zd(gnode);
-    }
-
-    Real_t dt2 = Real_t(0.5) * dt;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local,
-                                          y_local,
-                                          z_local,
-                                          B, &detJ );
-
-    CalcElemVelocityGradient( xd_local,
-                              yd_local,
-                              zd_local,
-                              B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    domain.dxx(k) = D[0];
-    domain.dyy(k) = D[1];
-    domain.dzz(k) = D[2];
-  }
-}
-
-static inline
-void CalcLagrangeElements(Real_t deltatime)
-{
-   Index_t numElem = domain.numElem() ;
-   if (numElem > 0) {
-      CalcKinematicsForElems(numElem, deltatime) ;
-
-      // element loop to do some stuff not included in the elemlib function.
-
-#pragma omp parallel for firstprivate(numElem)
-      for ( Index_t k=0 ; k<numElem ; ++k )
-      {
-        // calc strain rate and apply as constraint (only done in FB element)
-        Real_t vdov = domain.dxx(k) + domain.dyy(k) + domain.dzz(k) ;
-        Real_t vdovthird = vdov/Real_t(3.0) ;
-        
-        // make the rate of deformation tensor deviatoric
-        domain.vdov(k) = vdov ;
-        domain.dxx(k) -= vdovthird ;
-        domain.dyy(k) -= vdovthird ;
-        domain.dzz(k) -= vdovthird ;
-
-        // See if any volumes are negative, and take appropriate action.
-        if (domain.vnew(k) <= Real_t(0.0))
-        {
-           exit(VolumeError) ;
-        }
-      }
-   }
-}
-
-static inline
-void CalcMonotonicQGradientsForElems()
-{
-#define SUM4(a,b,c,d) (a + b + c + d)
-   Index_t numElem = domain.numElem() ;
-
-#pragma omp parallel for firstprivate(numElem)
-   for (Index_t i = 0 ; i < numElem ; ++i ) {
-      const Real_t ptiny = Real_t(1.e-36) ;
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      const Index_t *elemToNode = domain.nodelist(i);
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = domain.x(n0) ;
-      Real_t x1 = domain.x(n1) ;
-      Real_t x2 = domain.x(n2) ;
-      Real_t x3 = domain.x(n3) ;
-      Real_t x4 = domain.x(n4) ;
-      Real_t x5 = domain.x(n5) ;
-      Real_t x6 = domain.x(n6) ;
-      Real_t x7 = domain.x(n7) ;
-
-      Real_t y0 = domain.y(n0) ;
-      Real_t y1 = domain.y(n1) ;
-      Real_t y2 = domain.y(n2) ;
-      Real_t y3 = domain.y(n3) ;
-      Real_t y4 = domain.y(n4) ;
-      Real_t y5 = domain.y(n5) ;
-      Real_t y6 = domain.y(n6) ;
-      Real_t y7 = domain.y(n7) ;
-
-      Real_t z0 = domain.z(n0) ;
-      Real_t z1 = domain.z(n1) ;
-      Real_t z2 = domain.z(n2) ;
-      Real_t z3 = domain.z(n3) ;
-      Real_t z4 = domain.z(n4) ;
-      Real_t z5 = domain.z(n5) ;
-      Real_t z6 = domain.z(n6) ;
-      Real_t z7 = domain.z(n7) ;
-
-      Real_t xv0 = domain.xd(n0) ;
-      Real_t xv1 = domain.xd(n1) ;
-      Real_t xv2 = domain.xd(n2) ;
-      Real_t xv3 = domain.xd(n3) ;
-      Real_t xv4 = domain.xd(n4) ;
-      Real_t xv5 = domain.xd(n5) ;
-      Real_t xv6 = domain.xd(n6) ;
-      Real_t xv7 = domain.xd(n7) ;
-
-      Real_t yv0 = domain.yd(n0) ;
-      Real_t yv1 = domain.yd(n1) ;
-      Real_t yv2 = domain.yd(n2) ;
-      Real_t yv3 = domain.yd(n3) ;
-      Real_t yv4 = domain.yd(n4) ;
-      Real_t yv5 = domain.yd(n5) ;
-      Real_t yv6 = domain.yd(n6) ;
-      Real_t yv7 = domain.yd(n7) ;
-
-      Real_t zv0 = domain.zd(n0) ;
-      Real_t zv1 = domain.zd(n1) ;
-      Real_t zv2 = domain.zd(n2) ;
-      Real_t zv3 = domain.zd(n3) ;
-      Real_t zv4 = domain.zd(n4) ;
-      Real_t zv5 = domain.zd(n5) ;
-      Real_t zv6 = domain.zd(n6) ;
-      Real_t zv7 = domain.zd(n7) ;
-
-      Real_t vol = domain.volo(i)*domain.vnew(i) ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;
-      Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;
-      Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;
-      Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;
-      Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;
-      Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;
-      Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      domain.delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;
-      dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;
-      dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;
-
-      domain.delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      domain.delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;
-      dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;
-      dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;
-
-      domain.delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      domain.delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;
-      dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;
-      dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;
-
-      domain.delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;
-   }
-#undef SUM4
-}
-
-static inline
-void CalcMonotonicQRegionForElems(// parameters
-                          Real_t qlc_monoq,
-                          Real_t qqc_monoq,
-                          Real_t monoq_limiter_mult,
-                          Real_t monoq_max_slope,
-                          Real_t ptiny,
-
-                          // the elementset length
-                          Index_t elength )
-{
-#pragma omp parallel for firstprivate(elength, qlc_monoq, qqc_monoq, monoq_limiter_mult, monoq_max_slope, ptiny)
-   for ( Index_t ielem = 0 ; ielem < elength; ++ielem ) {
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Index_t i = domain.matElemlist(ielem);
-      Int_t bcMask = domain.elemBC(i) ;
-      Real_t delvm, delvp ;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / ( domain.delv_xi(i) + ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case 0:         delvm = domain.delv_xi(domain.lxim(i)) ; break ;
-         case XI_M_SYMM: delvm = domain.delv_xi(i) ;            break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;                break ;
-         default:        /* ERROR */ ;                        break ;
-      }
-      switch (bcMask & XI_P) {
-         case 0:         delvp = domain.delv_xi(domain.lxip(i)) ; break ;
-         case XI_P_SYMM: delvp = domain.delv_xi(i) ;            break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;                break ;
-         default:        /* ERROR */ ;                        break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( domain.delv_eta(i) + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case 0:          delvm = domain.delv_eta(domain.letam(i)) ; break ;
-         case ETA_M_SYMM: delvm = domain.delv_eta(i) ;             break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;                  break ;
-         default:         /* ERROR */ ;                          break ;
-      }
-      switch (bcMask & ETA_P) {
-         case 0:          delvp = domain.delv_eta(domain.letap(i)) ; break ;
-         case ETA_P_SYMM: delvp = domain.delv_eta(i) ;             break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;                  break ;
-         default:         /* ERROR */ ;                          break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( domain.delv_zeta(i) + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case 0:           delvm = domain.delv_zeta(domain.lzetam(i)) ; break ;
-         case ZETA_M_SYMM: delvm = domain.delv_zeta(i) ;              break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;                    break ;
-         default:          /* ERROR */ ;                            break ;
-      }
-      switch (bcMask & ZETA_P) {
-         case 0:           delvp = domain.delv_zeta(domain.lzetap(i)) ; break ;
-         case ZETA_P_SYMM: delvp = domain.delv_zeta(i) ;              break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;                    break ;
-         default:          /* ERROR */ ;                            break ;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( domain.vdov(i) > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = domain.delv_xi(i)   * domain.delx_xi(i)   ;
-         Real_t delvxeta  = domain.delv_eta(i)  * domain.delx_eta(i)  ;
-         Real_t delvxzeta = domain.delv_zeta(i) * domain.delx_zeta(i) ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = domain.elemMass(i) / (domain.volo(i) * domain.vnew(i)) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      domain.qq(i) = qquad ;
-      domain.ql(i) = qlin  ;
-   }
-}
-
-static inline
-void CalcMonotonicQForElems()
-{  
-   //
-   // initialize parameters
-   // 
-   const Real_t ptiny        = Real_t(1.e-36) ;
-   Real_t monoq_max_slope    = domain.monoq_max_slope() ;
-   Real_t monoq_limiter_mult = domain.monoq_limiter_mult() ;
-
-   //
-   // calculate the monotonic q for pure regions
-   //
-   Index_t elength = domain.numElem() ;
-   if (elength > 0) {
-      Real_t qlc_monoq = domain.qlc_monoq();
-      Real_t qqc_monoq = domain.qqc_monoq();
-      CalcMonotonicQRegionForElems(// parameters
-                           qlc_monoq,
-                           qqc_monoq,
-                           monoq_limiter_mult,
-                           monoq_max_slope,
-                           ptiny,
-
-                           // the elemset length
-                           elength );
-   }
-}
-
-static inline
-void CalcQForElems()
-{
-   Real_t qstop = domain.qstop() ;
-   Index_t numElem = domain.numElem() ;
-
-   //
-   // MONOTONIC Q option
-   //
-
-   /* Calculate velocity gradients */
-   CalcMonotonicQGradientsForElems() ;
-
-   /* Transfer veloctiy gradients in the first order elements */
-   /* problem->commElements->Transfer(CommElements::monoQ) ; */
-   CalcMonotonicQForElems() ;
-
-   /* Don't allow excessive artificial viscosity */
-   if (numElem != 0) {
-      Index_t idx = -1; 
-      for (Index_t i=0; i<numElem; ++i) {
-         if ( domain.q(i) > qstop ) {
-            idx = i ;
-            break ;
-         }
-      }
-
-      if(idx >= 0) {
-         exit(QStopError) ;
-      }
-   }
-}
-
-static inline
-void CalcPressureForElems(Real_t* p_new, Real_t* bvc,
-                          Real_t* pbvc, Real_t* e_old,
-                          Real_t* compression, Real_t *vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          Index_t length)
-{
-
-#pragma omp parallel for firstprivate(length)
-   for (Index_t i = 0; i < length ; ++i) {
-      Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
-      Index_t iz = domain.matElemlist(i);
-      bvc[iz] = c1s * (compression[iz] + Real_t(1.));
-      pbvc[iz] = c1s;
-   }
-
-#pragma omp parallel for firstprivate(length, pmin, p_cut, eosvmax)
-   for (Index_t i = 0 ; i < length ; ++i){
-      Index_t iz = domain.matElemlist(i);
-      p_new[iz] = bvc[iz] * e_old[iz] ;
-
-      if    (FABS(p_new[iz]) <  p_cut   )
-         p_new[iz] = Real_t(0.0) ;
-
-      if    ( vnewc[iz] >= eosvmax ) /* impossible condition here? */
-         p_new[iz] = Real_t(0.0) ;
-
-      if    (p_new[iz]       <  pmin)
-         p_new[iz]   = pmin ;
-   }
-}
-
-static inline
-void CalcEnergyForElems(Real_t* p_new, Real_t* e_new, Real_t* q_new,
-                        Real_t* bvc, Real_t* pbvc, Real_t* p_old,
-                        Real_t* compression, Real_t* compHalfStep,
-                        Real_t* vnewc, Real_t* work, Real_t pmin,
-                        Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
-                        Real_t rho0, Real_t eosvmax, Index_t length)
-{
-   /* allocate domain length array */
-   Real_t *pHalfStep = Allocate<Real_t>(length) ;
-
-#pragma omp parallel for firstprivate(length, emin)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Index_t iz = domain.matElemlist(i);
-      e_new[iz] = domain.e(iz)
-         - Real_t(0.5) * domain.delv(iz) * (p_old[iz] + domain.q(iz))
-         + Real_t(0.5) * work[iz];
-
-      if (e_new[iz]  < emin ) {
-         e_new[iz] = emin ;
-      }
-   }
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                   pmin, p_cut, eosvmax, length);
-
-#pragma omp parallel for firstprivate(length, rho0)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Index_t iz = domain.matElemlist(i);
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[iz]) ;
-
-      if ( domain.delv(iz) > Real_t(0.) ) {
-         q_new[iz] /* = qq[iz] = ql[iz] */ = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[iz] * e_new[iz]
-                 + vhalf * vhalf * bvc[iz] * pHalfStep[iz] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc =Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[iz] = (ssc*domain.ql(iz) + domain.qq(iz)) ;
-      }
-
-      e_new[iz] = e_new[iz] + Real_t(0.5) * domain.delv(iz)
-         * (  Real_t(3.0)*(p_old[iz]     + domain.q(iz))
-              - Real_t(4.0)*(pHalfStep[iz] + q_new[iz])) ;
-   }
-
-#pragma omp parallel for firstprivate(length, emin, e_cut)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Index_t iz = domain.matElemlist(i);
-      e_new[iz] += Real_t(0.5) * work[iz];
-
-      if (FABS(e_new[iz]) < e_cut) {
-         e_new[iz] = Real_t(0.)  ;
-      }
-      if (     e_new[iz]  < emin ) {
-         e_new[iz] = emin ;
-      }
-   }
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, length);
-
-#pragma omp parallel for firstprivate(length, rho0, emin, e_cut)
-   for (Index_t i = 0 ; i < length ; ++i){
-      const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-      Real_t q_tilde ;
-
-      Index_t iz = domain.matElemlist(i);
-      if (domain.delv(iz) > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[iz] * e_new[iz]
-                 + vnewc[iz] * vnewc[iz] * bvc[iz] * p_new[iz] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*domain.ql(iz) + domain.qq(iz)) ;
-      }
-
-      e_new[iz] = e_new[iz] - (  Real_t(7.0)*(p_old[iz]     + domain.q(iz))
-                               - Real_t(8.0)*(pHalfStep[iz] + q_new[iz])
-                          + (p_new[iz] + q_tilde)) * domain.delv(iz)*sixth ;
-
-      if (FABS(e_new[iz]) < e_cut) {
-         e_new[iz] = Real_t(0.)  ;
-      }
-      if ( e_new[iz]  < emin ) {
-         e_new[iz] = emin ;
-      }
-   }
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                   pmin, p_cut, eosvmax, length);
-
-#pragma omp parallel for firstprivate(length, rho0, q_cut)
-   for (Index_t i = 0 ; i < length ; ++i){
-
-      Index_t iz = domain.matElemlist(i);
-      if ( domain.delv(iz) <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[iz] * e_new[iz]
-                 + vnewc[iz] * vnewc[iz] * bvc[iz] * p_new[iz] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[iz] = (ssc*domain.ql(iz) + domain.qq(iz)) ;
-
-         if (FABS(q_new[iz]) < q_cut) q_new[iz] = Real_t(0.) ;
-      }
-   }
-
-   Release(&pHalfStep) ;
-
-   return ;
-}
-
-static inline
-void CalcSoundSpeedForElems(Real_t *vnewc, Real_t rho0, Real_t *enewc,
-                            Real_t *pnewc, Real_t *pbvc,
-                            Real_t *bvc, Real_t ss4o3, Index_t nz)
-{
-#pragma omp parallel for firstprivate(nz, rho0, ss4o3)
-   for (Index_t i = 0; i < nz ; ++i) {
-      Index_t iz = domain.matElemlist(i);
-      Real_t ssTmp = (pbvc[iz] * enewc[iz] + vnewc[iz] * vnewc[iz] *
-                 bvc[iz] * pnewc[iz]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp) ;
-      }
-      domain.ss(iz) = ssTmp ;
-   }
-}
-
-static inline
-void EvalEOSForElems(Real_t *vnewc, Index_t length)
-{
-   Real_t  e_cut = domain.e_cut();
-   Real_t  p_cut = domain.p_cut();
-   Real_t  ss4o3 = domain.ss4o3();
-   Real_t  q_cut = domain.q_cut();
-
-   Real_t eosvmax = domain.eosvmax() ;
-   Real_t eosvmin = domain.eosvmin() ;
-   Real_t pmin    = domain.pmin() ;
-   Real_t emin    = domain.emin() ;
-   Real_t rho0    = domain.refdens() ;
-
-   /* Allocate *domain length* arrays */
-   Real_t *p_old = Allocate<Real_t>(length) ;
-   Real_t *compression = Allocate<Real_t>(length) ;
-   Real_t *compHalfStep = Allocate<Real_t>(length) ;
-   Real_t *work = Allocate<Real_t>(length) ;
-   Real_t *p_new = Allocate<Real_t>(length) ;
-   Real_t *e_new = Allocate<Real_t>(length) ;
-   Real_t *q_new = Allocate<Real_t>(length) ;
-   Real_t *bvc = Allocate<Real_t>(length) ;
-   Real_t *pbvc = Allocate<Real_t>(length) ;
-
-   /* compress data, minimal set */
-#pragma omp parallel
-   {
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t iz = domain.matElemlist(i) ;
-         p_old[iz] = domain.p(iz) ;
-      }
-
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i = 0; i < length ; ++i) {
-         Real_t vchalf ;
-         Index_t iz = domain.matElemlist(i) ;
-         compression[iz] = Real_t(1.) / vnewc[iz] - Real_t(1.);
-         vchalf = vnewc[iz] - domain.delv(iz) * Real_t(.5);
-         compHalfStep[iz] = Real_t(1.) / vchalf - Real_t(1.);
-      }
-
-   /* Check for v > eosvmax or v < eosvmin */
-      if ( eosvmin != Real_t(0.) ) {
-#pragma omp for nowait firstprivate(length,eosvmin)
-         for(Index_t i=0 ; i<length ; ++i) {
-            Index_t iz = domain.matElemlist(i) ;
-            if (vnewc[iz] <= eosvmin) { /* impossible due to calling func? */
-               compHalfStep[iz] = compression[iz] ;
-            }
-         }
-      }
-      if ( eosvmax != Real_t(0.) ) {
-#pragma omp for nowait firstprivate(length,eosvmax)
-         for(Index_t i=0 ; i<length ; ++i) {
-            Index_t iz = domain.matElemlist(i) ;
-            if (vnewc[iz] >= eosvmax) { /* impossible due to calling func? */
-               p_old[iz]        = Real_t(0.) ;
-               compression[iz]  = Real_t(0.) ;
-               compHalfStep[iz] = Real_t(0.) ;
-            }
-         }
-      }
-
-#pragma omp for firstprivate(length)
-      for (Index_t i = 0 ; i < length ; ++i) {
-         Index_t iz = domain.matElemlist(i) ;
-         work[iz] = Real_t(0.) ; 
-      }
-   }
-
-   CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
-                 p_old, compression, compHalfStep,
-                 vnewc, work, pmin,
-                 p_cut, e_cut, q_cut, emin,
-                 rho0, eosvmax, length);
-
-
-#pragma omp parallel
-   {
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t iz = domain.matElemlist(i) ;
-         domain.p(iz) = p_new[iz] ;
-      }
-
-#pragma omp for nowait firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t iz = domain.matElemlist(i) ;
-         domain.e(iz) = e_new[iz] ;
-      }
-
-#pragma omp for firstprivate(length)
-      for (Index_t i=0; i<length; ++i) {
-         Index_t iz = domain.matElemlist(i) ;
-         domain.q(iz) = q_new[iz] ;
-      }
-   }
-
-   CalcSoundSpeedForElems(vnewc, rho0, e_new, p_new,
-             pbvc, bvc, ss4o3, length) ;
-
-   Release(&pbvc) ;
-   Release(&bvc) ;
-   Release(&q_new) ;
-   Release(&e_new) ;
-   Release(&p_new) ;
-   Release(&work) ;
-   Release(&compHalfStep) ;
-   Release(&compression) ;
-   Release(&p_old) ;
-}
-
-static inline
-void ApplyMaterialPropertiesForElems()
-{
-  Index_t length = domain.numElem() ;
-
-  if (length != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = domain.eosvmin() ;
-    Real_t eosvmax = domain.eosvmax() ;
-    /* Allocate *domain length* array */
-    Real_t *vnewc = Allocate<Real_t>(length) ;
-
-#pragma omp parallel
-    {
-#pragma omp for nowait firstprivate(length)
-       for (Index_t i=0 ; i<length ; ++i) {
-          Index_t iz = domain.matElemlist(i) ;
-          vnewc[iz] = domain.vnew(iz) ;
-       }
-
-       if (eosvmin != Real_t(0.)) {
-#pragma omp for nowait firstprivate(length,eosvmin)
-          for(Index_t i=0 ; i<length ; ++i) {
-             Index_t iz = domain.matElemlist(i) ;
-             if (vnewc[iz] < eosvmin)
-                vnewc[iz] = eosvmin ;
-          }
-       }
-
-       if (eosvmax != Real_t(0.)) {
-#pragma omp for nowait firstprivate(length,eosvmax)
-          for(Index_t i=0 ; i<length ; ++i) {
-             Index_t iz = domain.matElemlist(i) ;
-             if (vnewc[iz] > eosvmax)
-                vnewc[iz] = eosvmax ;
-          }
-       }
-
-#pragma omp for firstprivate(length,eosvmin,eosvmax)
-       for (Index_t i=0; i<length; ++i) {
-          Index_t iz = domain.matElemlist(i) ;
-          Real_t vc = domain.v(iz) ;
-          if (eosvmin != Real_t(0.)) {
-             if (vc < eosvmin)
-                vc = eosvmin ;
-          }
-          if (eosvmax != Real_t(0.)) {
-             if (vc > eosvmax)
-                vc = eosvmax ;
-          }
-          if (vc <= 0.) {
-             exit(VolumeError) ;
-          }
-       }
-    }
-
-    EvalEOSForElems(vnewc, length);
-
-    Release(&vnewc) ;
-  }
-}
-
-static inline
-void UpdateVolumesForElems()
-{
-   Index_t numElem = domain.numElem();
-   if (numElem != 0) {
-      Real_t v_cut = domain.v_cut();
-
-#pragma omp parallel for firstprivate(numElem,v_cut)
-      for(Index_t i=0 ; i<numElem ; ++i) {
-         Real_t tmpV ;
-         tmpV = domain.vnew(i) ;
-
-         if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-            tmpV = Real_t(1.0) ;
-         domain.v(i) = tmpV ;
-      }
-   }
-
-   return ;
-}
-
-static inline
-void LagrangeElements()
-{
-  const Real_t deltatime = domain.deltatime() ;
-
-  CalcLagrangeElements(deltatime) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems() ;
-
-  ApplyMaterialPropertiesForElems() ;
-
-  UpdateVolumesForElems() ;
-}
-
-static inline
-void CalcCourantConstraintForElems()
-{
-   Real_t dtcourant = Real_t(1.0e+20) ;
-   Index_t   courant_elem = -1 ;
-   Real_t      qqc = domain.qqc() ;
-   Index_t length = domain.numElem() ;
-
-   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
-
-#pragma omp parallel for firstprivate(length,qqc2), shared(dtcourant,courant_elem)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Index_t indx = domain.matElemlist(i) ;
-
-      Real_t dtf = domain.ss(indx) * domain.ss(indx) ;
-
-      if ( domain.vdov(indx) < Real_t(0.) ) {
-
-         dtf = dtf
-            + qqc2 * domain.arealg(indx) * domain.arealg(indx)
-            * domain.vdov(indx) * domain.vdov(indx) ;
-      }
-
-      dtf = SQRT(dtf) ;
-
-      dtf = domain.arealg(indx) / dtf ;
-
-   /* determine minimum timestep with its corresponding elem */
-      if (domain.vdov(indx) != Real_t(0.)) {
-#pragma omp critical
-         {
-            if ( dtf < dtcourant ) {
-               dtcourant = dtf ;
-               courant_elem = indx ;
-            }
-         }
-      }
-   }
-
-   /* Don't try to register a time constraint if none of the elements
-    * were active */
-   if (courant_elem != -1) {
-      domain.dtcourant() = dtcourant ;
-   }
-
-   return ;
-}
-
-static inline
-void CalcHydroConstraintForElems()
-{
-   Real_t dthydro = Real_t(1.0e+20) ;
-   Index_t hydro_elem = -1 ;
-   Real_t dvovmax = domain.dvovmax() ;
-   Index_t length = domain.numElem() ;
-
-#pragma omp parallel for firstprivate(length), shared(dthydro,hydro_elem)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Index_t indx = domain.matElemlist(i) ;
-
-      if (domain.vdov(indx) != Real_t(0.)) {
-         Real_t dtdvov = dvovmax / (FABS(domain.vdov(indx))+Real_t(1.e-20)) ;
-#pragma omp critical
-         {
-            if ( dthydro > dtdvov ) {
-               dthydro = dtdvov ;
-               hydro_elem = indx ;
-            }
-         }
-      }
-   }
-
-   if (hydro_elem != -1) {
-      domain.dthydro() = dthydro ;
-   }
-
-   return ;
-}
-
-static inline
-void CalcTimeConstraintsForElems() {
-   /* evaluate time constraint */
-   CalcCourantConstraintForElems() ;
-
-   /* check hydro constraint */
-   CalcHydroConstraintForElems() ;
-}
-
-static inline
-void LagrangeLeapFrog()
-{
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal();
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements();
-
-   CalcTimeConstraintsForElems();
-
-   // LagrangeRelease() ;  Creation/destruction of temps may be important to capture 
-}
-
-int main(int argc, char *argv[])
-{
-   RAJA::Timer timer_main;
-   RAJA::Timer timer_cycle;
-
-   timer_main.start("timer_main");
-
-
-   Index_t edgeElems = 45 ;
-   Index_t edgeNodes = edgeElems+1 ;
-   // Real_t ds = Real_t(1.125)/Real_t(edgeElems) ; /* may accumulate roundoff */
-   Real_t tx, ty, tz ;
-   Index_t nidx, zidx ;
-   Index_t domElems, domNodes ;
-
-   /* get run options to measure various metrics */
-
-   /* ... */
-
-   /****************************/
-   /*   Initialize Sedov Mesh  */
-   /****************************/
-
-   /* construct a uniform box for this processor */
-
-   domain.sizeX()   = edgeElems ;
-   domain.sizeY()   = edgeElems ;
-   domain.sizeZ()   = edgeElems ;
-   domain.numElem() = edgeElems*edgeElems*edgeElems ;
-   domain.numNode() = edgeNodes*edgeNodes*edgeNodes ;
-
-   domElems = domain.numElem() ;
-   domNodes = domain.numNode() ;
-
-   /* allocate field memory */
-
-   domain.AllocateElemPersistent(domain.numElem()) ;
-   domain.AllocateElemTemporary (domain.numElem()) ;
-
-   domain.AllocateNodalPersistent(domain.numNode()) ;
-   domain.AllocateNodesets(edgeNodes*edgeNodes) ;
-
-   /* Basic Field Initialization */
-
-   for (Index_t i=0; i<domElems; ++i) {
-      domain.e(i) = Real_t(0.0) ;
-      domain.p(i) = Real_t(0.0) ;
-      domain.q(i) = Real_t(0.0) ;
-      domain.v(i) = Real_t(1.0) ;
-   }
-
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.xd(i) = Real_t(0.0) ;
-      domain.yd(i) = Real_t(0.0) ;
-      domain.zd(i) = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.xdd(i) = Real_t(0.0) ;
-      domain.ydd(i) = Real_t(0.0) ;
-      domain.zdd(i) = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<domNodes; ++i) {
-      domain.nodalMass(i) = Real_t(0.0) ;
-   }
-
-   /* initialize nodal coordinates */
-
-   nidx = 0 ;
-   tz  = Real_t(0.) ;
-   for (Index_t plane=0; plane<edgeNodes; ++plane) {
-      ty = Real_t(0.) ;
-      for (Index_t row=0; row<edgeNodes; ++row) {
-         tx = Real_t(0.) ;
-         for (Index_t col=0; col<edgeNodes; ++col) {
-            domain.x(nidx) = tx ;
-            domain.y(nidx) = ty ;
-            domain.z(nidx) = tz ;
-            ++nidx ;
-            // tx += ds ; /* may accumulate roundoff... */
-            tx = Real_t(1.125)*Real_t(col+1)/Real_t(edgeElems) ;
-         }
-         // ty += ds ;  /* may accumulate roundoff... */
-         ty = Real_t(1.125)*Real_t(row+1)/Real_t(edgeElems) ;
-      }
-      // tz += ds ;  /* may accumulate roundoff... */
-      tz = Real_t(1.125)*Real_t(plane+1)/Real_t(edgeElems) ;
-   }
-
-
-   /* embed hexehedral elements in nodal point lattice */
-
-   nidx = 0 ;
-   zidx = 0 ;
-   for (Index_t plane=0; plane<edgeElems; ++plane) {
-      for (Index_t row=0; row<edgeElems; ++row) {
-         for (Index_t col=0; col<edgeElems; ++col) {
-            Index_t *localNode = domain.nodelist(zidx) ;
-            localNode[0] = nidx                                       ;
-            localNode[1] = nidx                                   + 1 ;
-            localNode[2] = nidx                       + edgeNodes + 1 ;
-            localNode[3] = nidx                       + edgeNodes     ;
-            localNode[4] = nidx + edgeNodes*edgeNodes                 ;
-            localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
-            localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
-            localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
-            ++zidx ;
-            ++nidx ;
-         }
-         ++nidx ;
-      }
-      nidx += edgeNodes ;
-   }
-
-   domain.AllocateNodeElemIndexes() ;
-
-   /* Create a material IndexSet (entire domain same material for now) */
-   for (Index_t i=0; i<domElems; ++i) {
-      domain.matElemlist(i) = i ;
-   }
-   
-   /* initialize material parameters */
-   domain.dtfixed() = Real_t(-1.0e-7) ;
-   domain.deltatime() = Real_t(1.0e-7) ;
-   domain.deltatimemultlb() = Real_t(1.1) ;
-   domain.deltatimemultub() = Real_t(1.2) ;
-   domain.stoptime()  = Real_t(1.0e-2) ;
-   domain.dtcourant() = Real_t(1.0e+20) ;
-   domain.dthydro()   = Real_t(1.0e+20) ;
-   domain.dtmax()     = Real_t(1.0e-2) ;
-   domain.time()    = Real_t(0.) ;
-   domain.cycle()   = 0 ;
-
-   domain.e_cut() = Real_t(1.0e-7) ;
-   domain.p_cut() = Real_t(1.0e-7) ;
-   domain.q_cut() = Real_t(1.0e-7) ;
-   domain.u_cut() = Real_t(1.0e-7) ;
-   domain.v_cut() = Real_t(1.0e-10) ;
-
-   domain.hgcoef()      = Real_t(3.0) ;
-   domain.ss4o3()       = Real_t(4.0)/Real_t(3.0) ;
-
-   domain.qstop()              =  Real_t(1.0e+12) ;
-   domain.monoq_max_slope()    =  Real_t(1.0) ;
-   domain.monoq_limiter_mult() =  Real_t(2.0) ;
-   domain.qlc_monoq()          = Real_t(0.5) ;
-   domain.qqc_monoq()          = Real_t(2.0)/Real_t(3.0) ;
-   domain.qqc()                = Real_t(2.0) ;
-
-   domain.pmin() =  Real_t(0.) ;
-   domain.emin() = Real_t(-1.0e+15) ;
-
-   domain.dvovmax() =  Real_t(0.1) ;
-
-   domain.eosvmax() =  Real_t(1.0e+9) ;
-   domain.eosvmin() =  Real_t(1.0e-9) ;
-
-   domain.refdens() =  Real_t(1.0) ;
-
-   /* initialize field data */
-   for (Index_t i=0; i<domElems; ++i) {
-      Real_t x_local[8], y_local[8], z_local[8] ;
-      Index_t *elemToNode = domain.nodelist(i) ;
-      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-      {
-        Index_t gnode = elemToNode[lnode];
-        x_local[lnode] = domain.x(gnode);
-        y_local[lnode] = domain.y(gnode);
-        z_local[lnode] = domain.z(gnode);
-      }
-
-      // volume calculations
-      Real_t volume = CalcElemVolume(x_local, y_local, z_local );
-      domain.volo(i) = volume ;
-      domain.elemMass(i) = volume ;
-      for (Index_t j=0; j<8; ++j) {
-         Index_t idx = elemToNode[j] ;
-         domain.nodalMass(idx) += volume / Real_t(8.0) ;
-      }
-   }
-
-   /* deposit energy */
-   domain.e(0) = Real_t(3.948746e+7) ;
-
-   /* set up symmetry nodesets */
-   nidx = 0 ;
-   for (Index_t i=0; i<edgeNodes; ++i) {
-      Index_t planeInc = i*edgeNodes*edgeNodes ;
-      Index_t rowInc   = i*edgeNodes ;
-      for (Index_t j=0; j<edgeNodes; ++j) {
-         domain.symmX(nidx) = planeInc + j*edgeNodes ;
-         domain.symmY(nidx) = planeInc + j ;
-         domain.symmZ(nidx) = rowInc   + j ;
-         ++nidx ;
-      }
-   }
-
-   /* set up elemement connectivity information */
-   domain.lxim(0) = 0 ;
-   for (Index_t i=1; i<domElems; ++i) {
-      domain.lxim(i)   = i-1 ;
-      domain.lxip(i-1) = i ;
-   }
-   domain.lxip(domElems-1) = domElems-1 ;
-
-   for (Index_t i=0; i<edgeElems; ++i) {
-      domain.letam(i) = i ; 
-      domain.letap(domElems-edgeElems+i) = domElems-edgeElems+i ;
-   }
-   for (Index_t i=edgeElems; i<domElems; ++i) {
-      domain.letam(i) = i-edgeElems ;
-      domain.letap(i-edgeElems) = i ;
-   }
-
-   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
-      domain.lzetam(i) = i ;
-      domain.lzetap(domElems-edgeElems*edgeElems+i) = domElems-edgeElems*edgeElems+i ;
-   }
-   for (Index_t i=edgeElems*edgeElems; i<domElems; ++i) {
-      domain.lzetam(i) = i - edgeElems*edgeElems ;
-      domain.lzetap(i-edgeElems*edgeElems) = i ;
-   }
-
-   /* set up boundary condition information */
-   for (Index_t i=0; i<domElems; ++i) {
-      domain.elemBC(i) = 0 ;  /* clear BCs by default */
-   }
-
-   /* faces on "external" boundaries will be */
-   /* symmetry plane or free surface BCs */
-   for (Index_t i=0; i<edgeElems; ++i) {
-      Index_t planeInc = i*edgeElems*edgeElems ;
-      Index_t rowInc   = i*edgeElems ;
-      for (Index_t j=0; j<edgeElems; ++j) {
-         domain.elemBC(planeInc+j*edgeElems) |= XI_M_SYMM ;
-         domain.elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_FREE ;
-         domain.elemBC(planeInc+j) |= ETA_M_SYMM ;
-         domain.elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= ETA_P_FREE ;
-         domain.elemBC(rowInc+j) |= ZETA_M_SYMM ;
-         domain.elemBC(rowInc+j+domElems-edgeElems*edgeElems) |= ZETA_P_FREE ;
-      }
-   }
-
-
-   /* timestep to solution */
-   timer_cycle.start("timer_cycle");
-   while(domain.time() < domain.stoptime() ) {
-      TimeIncrement() ;
-      LagrangeLeapFrog() ;
-      /* problem->commNodes->Transfer(CommNodes::syncposvel) ; */
-#if LULESH_SHOW_PROGRESS
-      printf("time = %e, dt=%e\n",
-             double(domain.time()), double(domain.deltatime()) ) ;
-#endif
-   }
-   timer_cycle.stop("timer_cycle");
-
-   timer_main.stop("timer_main");
-
-   printf("Total Cycle Time (sec) = %Lf\n", timer_cycle.elapsed() );
-   printf("Total main Time (sec) = %Lf\n", timer_main.elapsed() );
-
-
-   return 0 ;
-}
-
diff --git a/test/LULESH-v1.0/LULESH-v1.0_baseline/runme b/test/LULESH-v1.0/LULESH-v1.0_baseline/runme
deleted file mode 100755
index 3df2875e0..000000000
--- a/test/LULESH-v1.0/LULESH-v1.0_baseline/runme
+++ /dev/null
@@ -1,3 +0,0 @@
-date; 
-srun -N 1 -p pdebug --exclusive time ./lulesh-OMP.exe;
-date
diff --git a/test/LULESH-v1.0/README b/test/LULESH-v1.0/README
deleted file mode 100644
index 118ed445a..000000000
--- a/test/LULESH-v1.0/README
+++ /dev/null
@@ -1,42 +0,0 @@
-##
-## Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-## 
-## Produced at the Lawrence Livermore National Laboratory.
-## 
-## All rights reserved.
-## 
-## For release details and restrictions, please see raja/README-license.txt
-##
-
-
-#
-# The subdirectories in this directory contain different versions of the
-# LULESH 1.0 proxy app. 
-#
-# The subdirectory LULESH-v1.0_baseline contains a reference version of 
-# LULESH 1.0 that is available at https://codesign.llnl.gov/lulesh.php.
-#
-# The subdirectory LULESH-v1.0_RAJA-variants contains three RAJA variants
-# of LULESH 1.0. They are in three files: 
-#
-# 1) luleshRAJA-serial.cxx contains a serial-only version.
-# 2) luleshRAJA-parallel.cxx.src contains a version that can be run 
-#    in 10 different parallel variants using RAJA (different data layouts,
-#    different OpenMP parallelization strategies (including lock-free
-#    dependency graphs), CilkPlus, and GPU using CUDA). See luleshPolicy.hxx
-#    more more details and how to select the execution mode.
-# 3) luleshRAJA-parallel-FT.cxx contains a version similar to #2, but with a
-#    loop-level fault tolerance mechanism enabled. Also, all loops in the FT
-#    version are idempotent which is required for the fault recovery mechanism
-#    to relaunch loops. 
-#
-# When RAJA is compiled, the default variants of these examples will be 
-# generated.
-#
-# RAJA must be built with CUDA enabled to generate GPU variants.
-#
-# NOTE: When running CUDA variants of RAJA LULESH, we advise you to set the
-#       environment variable CUDA_VISIBLE_DEVICES to zero before running.
-#       We are using CUDA Unified Memory and we find that this setting 
-#       greatly improves performance.
-#
diff --git a/test/LULESH-v2.0/CMakeLists.txt b/test/LULESH-v2.0/CMakeLists.txt
deleted file mode 100644
index 975de3b06..000000000
--- a/test/LULESH-v2.0/CMakeLists.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-add_subdirectory(LULESH-v2.0_baseline)
-add_subdirectory(LULESH-v2.0_RAJA-variants)
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/CMakeLists.txt b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/CMakeLists.txt
deleted file mode 100644
index 6077547a1..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/CMakeLists.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-add_subdirectory(LULESH-v2.0_RAJA-basic)
-add_subdirectory(LULESH-v2.0_RAJA-IndexSet)
-add_subdirectory(LULESH-v2.0_RAJA-MICfriendly)
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/CMakeLists.txt b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/CMakeLists.txt
deleted file mode 100644
index ca652f61c..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/CMakeLists.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-add_definitions(-DUSE_MPI=0 -DUSE_OMP=1)
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-if (RAJA_ENABLE_CUDA)
-  add_definitions(-DUSE_CASE=9)
-  cuda_add_executable(lulesh2.0_RAJA-ISet.exe
-    lulesh.cc
-    lulesh-comm.cc
-    lulesh-init.cc
-    lulesh-util.cc
-    lulesh-viz.cc)
-  target_link_libraries(lulesh2.0_RAJA-ISet.exe RAJA ${RT_LIBRARIES})
-elseif(RAJA_ENABLE_OPENMP)
-  add_executable(lulesh2.0_RAJA-ISet.exe
-    lulesh.cc
-    lulesh-comm.cc
-    lulesh-init.cc
-    lulesh-util.cc
-    lulesh-viz.cc)
-  target_link_libraries(lulesh2.0_RAJA-ISet.exe RAJA ${RT_LIBRARIES})
-endif()
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/README b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/README
deleted file mode 100644
index 8b0f260ba..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/README
+++ /dev/null
@@ -1,53 +0,0 @@
-This is the README for LULESH 2.0
-
-More information including LULESH 1.0 can be found at https://codesign.llnl.gov/lulesh.php
-
-If you have any questions or problems please contact:
-
-Ian Karlin <karlin1@llnl.gov>
-Jeff Keasler <keasler1@llnl.gov> or
-Rob Neely <neely4@llnl.gov>
-
-Also please send any notable results to Ian Karlin <karlin1@llnl.gov> as we are still evaluating the performance of this code.
-
-*** Notable changes in LULESH 2.0 ***
-
-Split functionality into different files
-lulesh.cc - where most (all?) of the timed functionality lies
-lulesh-comm.cc - MPI functionality
-lulesh-init.cc - Setup code
-lulesh-viz.cc  - Support for visualization option
-lulesh-util.cc - Non-timed functions
-
-The concept of "regions" was added, although every region is the same ideal gas material, and the same sedov blast wave problem is still the only problem its hardcoded to solve. Regions allow two things important to making this proxy app more representative:
-
-Four of the LULESH routines are now performed on a region-by-region basis, making the memory access patterns non-unit stride
-
-Artificial load imbalances can be easily introduced that could impact parallelization strategies.  
-   * The load balance flag changes region assignment.  Region number is raised to the power entered for assignment probability.  Most likely regions changes with MPI process id.
-   * The cost flag raises the cost of ~45% of the regions to evaluate EOS by the entered multiple.  The cost of 5% is 10x the entered
- multiple.
-
-MPI and OpenMP were added, and coalesced into a single version of the source that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
-
-Added support to write plot files using "poor mans parallel I/O" when linked with the silo library, which in turn can be read by VisIt.
-
-Enabled variable timestep calculation by default (courant condition), which results in an additional reduction.  Also, seeded the initial timestep based on analytical equation to allow scaling to arbitrary size.  Therefore steps to solution will differ from LULESH 1.0.
-
-Default domain (mesh) size reduced from 45^3 to 30^3
-
-Command line options to allow for numerous test cases without needing to recompile
-
-Performance optimizations and code cleanup uncovered during study of LULESH 1.0
-
-Added a "Figure of Merit" calculation (elements solved per microsecond) and output in support of using LULESH 2.0 for the 2017 CORAL procurement
-
-Possible Future 2.0 minor updates (other changes possible as discovered)
-
-* Different default parameters
-* Minor code performance changes and cleanupS
-
-TODO in future versions
-* Add reader for (truly) unstructured meshes, probably serial only
-* CMake based build system
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-comm.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-comm.cc
deleted file mode 100644
index a30c3ec1c..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-comm.cc
+++ /dev/null
@@ -1,1837 +0,0 @@
-#include "lulesh.h"
-
-// If no MPI, then this whole file is stubbed out
-#if USE_MPI
-
-#include <mpi.h>
-#include <string.h>
-
-/* Comm Routines */
-
-#define ALLOW_UNPACKED_PLANE false
-#define ALLOW_UNPACKED_ROW   false
-#define ALLOW_UNPACKED_COL   false
-
-/*
-   There are coherence issues for packing and unpacking message
-   buffers.  Ideally, you would like a lot of threads to 
-   cooperate in the assembly/dissassembly of each message.
-   To do that, each thread should really be operating in a
-   different coherence zone.
-
-   Let's assume we have three fields, f1 through f3, defined on
-   a 61x61x61 cube.  If we want to send the block boundary
-   information for each field to each neighbor processor across
-   each cube face, then we have three cases for the
-   memory layout/coherence of data on each of the six cube
-   boundaries:
-
-      (a) Two of the faces will be in contiguous memory blocks
-      (b) Two of the faces will be comprised of pencils of
-          contiguous memory.
-      (c) Two of the faces will have large strides between
-          every value living on the face.
-
-   How do you pack and unpack this data in buffers to
-   simultaneous achieve the best memory efficiency and
-   the most thread independence?
-
-   Do do you pack field f1 through f3 tighly to reduce message
-   size?  Do you align each field on a cache coherence boundary
-   within the message so that threads can pack and unpack each
-   field independently?  For case (b), do you align each
-   boundary pencil of each field separately?  This increases
-   the message size, but could improve cache coherence so
-   each pencil could be processed independently by a separate
-   thread with no conflicts.
-
-   Also, memory access for case (c) would best be done without
-   going through the cache (the stride is so large it just causes
-   a lot of useless cache evictions).  Is it worth creating
-   a special case version of the packing algorithm that uses
-   non-coherent load/store opcodes?
-*/
-
-/******************************************/
-
-
-/* doRecv flag only works with regular block structure */
-void CommRecv(Domain& domain, int msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz, bool doRecv, bool planeOnly) {
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   /* post recieve buffers for all incoming messages */
-   int myRank ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   for (Index_t i=0; i<26; ++i) {
-      domain.recvRequest[i] = MPI_REQUEST_NULL ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   /* post receives */
-
-   /* receive data from neighboring domain faces */
-   if (planeMin && doRecv) {
-      /* contiguous memory */
-      int fromRank = myRank - domain.tp()*domain.tp() ;
-      int recvCount = dx * dy * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (planeMax) {
-      /* contiguous memory */
-      int fromRank = myRank + domain.tp()*domain.tp() ;
-      int recvCount = dx * dy * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (rowMin && doRecv) {
-      /* semi-contiguous memory */
-      int fromRank = myRank - domain.tp() ;
-      int recvCount = dx * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (rowMax) {
-      /* semi-contiguous memory */
-      int fromRank = myRank + domain.tp() ;
-      int recvCount = dx * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (colMin && doRecv) {
-      /* scattered memory */
-      int fromRank = myRank - 1 ;
-      int recvCount = dy * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (colMax) {
-      /* scattered memory */
-      int fromRank = myRank + 1 ;
-      int recvCount = dy * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-
-   if (!planeOnly) {
-      /* receive data from domains connected only by an edge */
-      if (rowMin && colMin && doRecv) {
-         int fromRank = myRank - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMax) {
-         int fromRank = myRank + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMin) {
-         int fromRank = myRank + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMax && doRecv) {
-         int fromRank = myRank - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      /* receive data from domains connected only by a corner */
-      if (rowMin && colMin && planeMin && doRecv) {
-         /* corner at domain logical coord (0, 0, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMin && planeMax) {
-         /* corner at domain logical coord (0, 0, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMin && doRecv) {
-         /* corner at domain logical coord (1, 0, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMax) {
-         /* corner at domain logical coord (1, 0, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMin && doRecv) {
-         /* corner at domain logical coord (0, 1, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMax) {
-         /* corner at domain logical coord (0, 1, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMin && doRecv) {
-         /* corner at domain logical coord (1, 1, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMax) {
-         /* corner at domain logical coord (1, 1, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-   }
-}
-
-/******************************************/
-
-void CommSend(Domain& domain, int msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly)
-{
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   /* post recieve buffers for all incoming messages */
-   int myRank ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
-   MPI_Status status[26] ;
-   Real_t *destAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   for (Index_t i=0; i<26; ++i) {
-      domain.sendRequest[i] = MPI_REQUEST_NULL ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   /* post sends */
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dx * dy ;
-
-      if (planeMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<sendCount; ++i) {
-               destAddr[i] = (domain.*src)(i) ;
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - domain.tp()*domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (planeMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<sendCount; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz - 1) + i) ;
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + domain.tp()*domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dx * dz ;
-
-      if (rowMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  destAddr[i*dx+j] = (domain.*src)(i*dx*dy + j) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (rowMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  destAddr[i*dx+j] = (domain.*src)(dx*(dy - 1) + i*dx*dy + j) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dy * dz ;
-
-      if (colMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  destAddr[i*dy + j] = (domain.*src)(i*dx*dy + j*dx) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - 1, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (colMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  destAddr[i*dy + j] = (domain.*src)(dx - 1 + i*dx*dy + j*dx) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + 1, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-
-   if (!planeOnly) {
-      if (rowMin && colMin) {
-         int toRank = myRank - domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMax && doSend) {
-         int toRank = myRank + domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy - 1 + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-              destAddr[i] = (domain.*src)(dx*(dy-1) + dx*dy*(dz-1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + dx - 1 + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMin && doSend) {
-         int toRank = myRank + domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx*(dy-1) + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMax) {
-         int toRank = myRank - domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx - 1 + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(dx*(dy - 1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx - 1 + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMin && planeMin) {
-         /* corner at domain logical coord (0, 0, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(0) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMin && planeMax && doSend) {
-         /* corner at domain logical coord (0, 0, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMin) {
-         /* corner at domain logical coord (1, 0, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMax && doSend) {
-         /* corner at domain logical coord (1, 0, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMin) {
-         /* corner at domain logical coord (0, 1, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*(dy - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMax && doSend) {
-         /* corner at domain logical coord (0, 1, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMin) {
-         /* corner at domain logical coord (1, 1, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMax && doSend) {
-         /* corner at domain logical coord (1, 1, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*dz - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-   }
-
-   MPI_Waitall(26, domain.sendRequest, status) ;
-}
-
-/******************************************/
-
-void CommSBN(Domain& domain, int xferFields, Domain_member *fieldData) {
-   /* Waits on the requests in domain.recvRequest and adds (+=) each payload into the xferFields fields of fieldData. */
-   if (domain.numRanks() == 1)
-      return ; /* single rank: nothing to do */
-
-   /* summation order should be from smallest value to largest */
-   /* or we could try out Kahan summation! */
-
-   int myRank ; /* filled by MPI_Comm_rank below; not read again in this function */
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ; /* buffer stride of one plane message slot */
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ; /* buffer stride of one edge message slot */
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   Index_t dx = domain.sizeX() + 1 ; /* extents for flat indexing: idx = x + y*dx + z*dx*dy */
-   Index_t dy = domain.sizeY() + 1 ;
-   Index_t dz = domain.sizeZ() + 1 ;
-   MPI_Status status ;
-   Real_t *srcAddr ; /* read cursor into domain.commDataRecv */
-   Index_t rowMin, rowMax, colMin, colMax, planeMin, planeMax ; /* 0/1 flags, one per side */
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = 1 ;
-   if (domain.rowLoc() == 0) { /* rowLoc() at its lowest value: clear the low-row flag */
-      rowMin = 0 ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) { /* rowLoc() at tp()-1: clear the high-row flag */
-      rowMax = 0 ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = 0 ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = 0 ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = 0 ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = 0 ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) { /* bitwise | on 0/1 flags: either z-side flag set */
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ; /* values per field in one z-plane message */
-
-      if (planeMin) {
-         /* contiguous memory: plane z=0 occupies indices [0, dx*dy) */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ; /* block until this request completes */
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ; /* member-function pointer used as the field accessor */
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(i) += srcAddr[i] ;
-            }
-            srcAddr += opCount ; /* step to the next field's values in the buffer */
-         }
-         ++pmsg ; /* next plane message slot / request */
-      }
-      if (planeMax) {
-         /* contiguous memory: plane z=dz-1 starts at index dx*dy*(dz-1) */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(dx*dy*(dz - 1) + i) += srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ; /* values per field in one y-face message */
-
-      if (rowMin) {
-         /* receive buffer is contiguous; destination (face y=0) is strided by dx*dy per z */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(i*dx*dy + j) += srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* receive buffer is contiguous; destination (face y=dy-1) is strided by dx*dy per z */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) += srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ; /* values per field in one x-face message */
-
-      if (colMin) {
-         /* receive buffer is contiguous; destination (face x=0) is strided by dx per y */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* receive buffer is contiguous; destination (face x=dx-1) is strided by dx per y */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(dx - 1 + i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin & colMin) { /* edge messages follow all plane messages in buffer and request order */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(i*dx*dy) += srcAddr[i] ; /* edge x=0, y=0, running along z */
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ; /* next edge message slot / request */
-   }
-
-   if (rowMin & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(i) += srcAddr[i] ; /* edge y=0, z=0, running along x */
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(i*dx) += srcAddr[i] ; /* edge x=0, z=0, running along y */
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & colMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*dy - 1 + i*dx*dy) += srcAddr[i] ; /* edge x=dx-1, y=dy-1, along z */
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) += srcAddr[i] ; /* edge y=dy-1, z=dz-1, along x */
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) += srcAddr[i] ; /* edge x=dx-1, z=dz-1, along y */
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & colMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*(dy-1) + i*dx*dy) += srcAddr[i] ; /* edge x=0, y=dy-1, along z */
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i) += srcAddr[i] ; /* edge y=0, z=dz-1, along x */
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i*dx) += srcAddr[i] ; /* edge x=0, z=dz-1, along y */
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & colMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx - 1 + i*dx*dy) += srcAddr[i] ; /* edge x=dx-1, y=0, along z */
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy - 1) + i) += srcAddr[i] ; /* edge y=dy-1, z=0, along x */
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx - 1 + i*dx) += srcAddr[i] ; /* edge x=dx-1, z=0, along y */
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & colMin & planeMin) { /* corner messages follow all plane and edge messages */
-      /* corner at domain logical coord (0, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(0) += comBuf[fi] ; /* one value per field */
-      }
-      ++cmsg ; /* next corner message slot / request */
-   }
-   if (rowMin & colMin & planeMax) {
-      /* corner at domain logical coord (0, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMax & planeMin) {
-      /* corner at domain logical coord (1, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMax & planeMax) {
-      /* corner at domain logical coord (1, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMin & planeMin) {
-      /* corner at domain logical coord (0, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMin & planeMax) {
-      /* corner at domain logical coord (0, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMax & planeMin) {
-      /* corner at domain logical coord (1, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMax & planeMax) {
-      /* corner at domain logical coord (1, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*dz - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-}
-
-/******************************************/
-
-void CommSyncPosVel(Domain& domain) {
-   /* Waits on requests in domain.recvRequest and overwrites (=) boundary values of x, y, z, xd, yd, zd with the payloads. */
-   if (domain.numRanks() == 1)
-      return ; /* single rank: nothing to do */
-
-   int myRank ; /* filled by MPI_Comm_rank below; not read again in this function */
-   bool doRecv = false ; /* never modified here, so every branch guarded by doRecv is skipped */
-   Index_t xferFields = 6 ; /* x, y, z, xd, yd, zd */
-   Domain_member fieldData[6] ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ; /* buffer stride of one plane message slot */
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ; /* buffer stride of one edge message slot */
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   Index_t dx = domain.sizeX() + 1 ; /* extents for flat indexing: idx = x + y*dx + z*dx*dy */
-   Index_t dy = domain.sizeY() + 1 ;
-   Index_t dz = domain.sizeZ() + 1 ;
-   MPI_Status status ;
-   Real_t *srcAddr ; /* read cursor into domain.commDataRecv */
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ; /* one flag per side */
-
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) { /* rowLoc() at its lowest value: clear the low-row flag */
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) { /* rowLoc() at tp()-1: clear the high-row flag */
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   fieldData[0] = &Domain::x ; /* order here fixes the per-field order read from each message */
-   fieldData[1] = &Domain::y ;
-   fieldData[2] = &Domain::z ;
-   fieldData[3] = &Domain::xd ;
-   fieldData[4] = &Domain::yd ;
-   fieldData[5] = &Domain::zd ;
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ; /* values per field in one z-plane message */
-
-      if (planeMin && doRecv) { /* skipped: doRecv is false */
-         /* contiguous memory: plane z=0 occupies indices [0, dx*dy) */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (planeMax) {
-         /* contiguous memory: plane z=dz-1 starts at index dx*dy*(dz-1) */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ; /* block until this request completes */
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ; /* member-function pointer used as the field accessor */
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(dx*dy*(dz - 1) + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ; /* step to the next field's values in the buffer */
-         }
-         ++pmsg ; /* next plane message slot / request */
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ; /* values per field in one y-face message */
-
-      if (rowMin && doRecv) { /* skipped: doRecv is false */
-         /* receive buffer is contiguous; destination (face y=0) is strided by dx*dy per z */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(i*dx*dy + j) = srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* receive buffer is contiguous; destination (face y=dy-1) is strided by dx*dy per z */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) = srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ; /* values per field in one x-face message */
-
-      if (colMin && doRecv) { /* skipped: doRecv is false */
-         /* receive buffer is contiguous; destination (face x=0) is strided by dx per y */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* receive buffer is contiguous; destination (face x=dx-1) is strided by dx per y */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(dx - 1 + i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin && colMin && doRecv) { /* skipped: doRecv is false */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(i*dx*dy) = srcAddr[i] ; /* edge x=0, y=0, running along z */
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && planeMin && doRecv) { /* skipped: doRecv is false */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(i) = srcAddr[i] ; /* edge y=0, z=0, running along x */
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin && planeMin && doRecv) { /* skipped: doRecv is false */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(i*dx) = srcAddr[i] ; /* edge x=0, z=0, running along y */
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && colMax) { /* edge messages follow all plane messages in buffer and request order */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*dy - 1 + i*dx*dy) = srcAddr[i] ; /* edge x=dx-1, y=dy-1, along z */
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ; /* next edge message slot / request */
-   }
-
-   if (rowMax && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) = srcAddr[i] ; /* edge y=dy-1, z=dz-1, along x */
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) = srcAddr[i] ; /* edge x=dx-1, z=dz-1, along y */
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && colMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*(dy-1) + i*dx*dy) = srcAddr[i] ; /* edge x=0, y=dy-1, along z */
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i) = srcAddr[i] ; /* edge y=0, z=dz-1, along x */
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i*dx) = srcAddr[i] ; /* edge x=0, z=dz-1, along y */
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && colMax && doRecv) { /* skipped: doRecv is false */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx - 1 + i*dx*dy) = srcAddr[i] ; /* edge x=dx-1, y=0, along z */
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && planeMin && doRecv) { /* skipped: doRecv is false */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy - 1) + i) = srcAddr[i] ; /* edge y=dy-1, z=0, along x */
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax && planeMin && doRecv) { /* skipped: doRecv is false */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx - 1 + i*dx) = srcAddr[i] ; /* edge x=dx-1, z=0, along y */
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-
-   if (rowMin && colMin && planeMin && doRecv) { /* skipped: doRecv is false */
-      /* corner at domain logical coord (0, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(0) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMin && planeMax) { /* corner messages follow all plane and edge messages */
-      /* corner at domain logical coord (0, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ; /* one value per field */
-      }
-      ++cmsg ; /* next corner message slot / request */
-   }
-   if (rowMin && colMax && planeMin && doRecv) { /* skipped: doRecv is false */
-      /* corner at domain logical coord (1, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMax && planeMax) {
-      /* corner at domain logical coord (1, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMin && planeMin && doRecv) { /* skipped: doRecv is false */
-      /* corner at domain logical coord (0, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMin && planeMax) {
-      /* corner at domain logical coord (0, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMax && planeMin && doRecv) { /* skipped: doRecv is false */
-      /* corner at domain logical coord (1, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMax && planeMax) {
-      /* corner at domain logical coord (1, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*dz - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-}
-
-/******************************************/
-
-void CommMonoQ(Domain& domain)
-{
-   if (domain.numRanks() == 1)
-      return ;
-
-   int myRank ;
-   Index_t xferFields = 3 ; /* delv_xi, delv_eta, delv_zeta */
-   Domain_member fieldData[3] ;
-   Index_t fieldOffset[3] ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t dx = domain.sizeX() ;
-   Index_t dy = domain.sizeY() ;
-   Index_t dz = domain.sizeZ() ;
-   MPI_Status status ;
-   Real_t *srcAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   /* point into ghost data area */
-   // fieldData[0] = &(domain.delv_xi(domain.numElem())) ;
-   // fieldData[1] = &(domain.delv_eta(domain.numElem())) ;
-   // fieldData[2] = &(domain.delv_zeta(domain.numElem())) ;
-   fieldData[0] = &Domain::delv_xi ;
-   fieldData[1] = &Domain::delv_eta ;
-   fieldData[2] = &Domain::delv_zeta ;
-   fieldOffset[0] = domain.numElem() ;
-   fieldOffset[1] = domain.numElem() ;
-   fieldOffset[2] = domain.numElem() ;
-
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ;
-
-      if (planeMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (planeMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ;
-
-      if (rowMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ;
-
-      if (colMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-}
-
-#endif
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-init.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-init.cc
deleted file mode 100644
index fd33bcc33..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-init.cc
+++ /dev/null
@@ -1,884 +0,0 @@
-#include <math.h>
-#if USE_MPI
-# include <mpi.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <cstdlib>
-#include "lulesh.h"
-
-/////////////////////////////////////////////////////////////////////
-Domain::Domain(Int_t numRanks, Index_t colLoc,
-               Index_t rowLoc, Index_t planeLoc,
-               Index_t nx, int tp, int nr, int balance, Int_t cost)
-   :
-   m_e_cut(Real_t(1.0e-7)),
-   m_p_cut(Real_t(1.0e-7)),
-   m_q_cut(Real_t(1.0e-7)),
-   m_v_cut(Real_t(1.0e-10)),
-   m_u_cut(Real_t(1.0e-7)),
-   m_hgcoef(Real_t(3.0)),
-   m_ss4o3(Real_t(4.0)/Real_t(3.0)),
-   m_qstop(Real_t(1.0e+12)),
-   m_monoq_max_slope(Real_t(1.0)),
-   m_monoq_limiter_mult(Real_t(2.0)),
-   m_qlc_monoq(Real_t(0.5)),
-   m_qqc_monoq(Real_t(2.0)/Real_t(3.0)),
-   m_qqc(Real_t(2.0)),
-   m_eosvmax(Real_t(1.0e+9)),
-   m_eosvmin(Real_t(1.0e-9)),
-   m_pmin(Real_t(0.)),
-   m_emin(Real_t(-1.0e+15)),
-   m_dvovmax(Real_t(0.1)),
-   m_refdens(Real_t(1.0)),
-//
-// set pointers to (potentially) "new'd" arrays to null to 
-// simplify deallocation.
-//
-   m_perm(0),
-   m_regNumList(0),
-#if defined(OMP_FINE_SYNC)
-   m_nodeElemStart(0),
-   m_nodeElemCornerList(0),
-#endif
-   m_regElemSize(0),
-   m_regElemlist(0)
-#if USE_MPI
-   , 
-   commDataSend(0),
-   commDataRecv(0)
-#endif
-{
-
-   Index_t edgeElems = nx ;
-   Index_t edgeNodes = edgeElems+1 ;
-   this->cost() = cost;
-
-   m_tp       = tp ;
-   m_numRanks = numRanks ;
-
-   ///////////////////////////////
-   //   Initialize Sedov Mesh
-   ///////////////////////////////
-
-   // construct a uniform box for this processor
-
-   m_colLoc   =   colLoc ;
-   m_rowLoc   =   rowLoc ;
-   m_planeLoc = planeLoc ;
-   
-   m_sizeX = edgeElems ;
-   m_sizeY = edgeElems ;
-   m_sizeZ = edgeElems ;
-   m_numElem = edgeElems*edgeElems*edgeElems ;
-
-   m_numNode = edgeNodes*edgeNodes*edgeNodes ;
-
-   m_regNumList = new Index_t[numElem()] ;  // material indexset
-
-#if !defined(LULESH_LIST_INDEXSET)
-   m_perm = new Index_t[numElem()] ;
-#endif
-   // Elem-centered 
-   AllocateElemPersistent(numElem()) ;
-
-   // Node-centered 
-   AllocateNodePersistent(numNode()) ;
-
-   SetupCommBuffers(edgeNodes);
-
-   BuildMeshTopology(edgeNodes, edgeElems);
-
-   BuildMeshCoordinates(nx, edgeNodes);
-
-   // Setup index sets for nodes and elems 
-   CreateMeshIndexSets();
-
-   // Setup symmetry nodesets
-   CreateSymmetryIndexSets(edgeNodes);
-
-   // Setup element connectivities
-   SetupElementConnectivities(edgeElems);
-
-   // Setup symmetry planes and free surface boundary arrays
-   SetupBoundaryConditions(edgeElems);
-
-   // Setup region index sets. For now, these are constant sized
-   // throughout the run, but could be changed every cycle to 
-   // simulate effects of ALE on the lagrange solver
-   CreateRegionIndexSets(nr, balance);
-
-   /* find element zero index */
-   Index_t initEnergyElemIdx = 0 ;
-
-   /* assign each material to a contiguous range of elements */
-   if ((m_perm != 0) && (nr != 1)) {
-      /* permute nodelist connectivity */
-      {
-         Index_t *tmp = new Index_t[8*numElem()] ;
-         // RAJA::forall<elem_exec_policy>(getElemISet(), [=] RAJA_DEVICE (int i) {
-         for (Index_t i=0; i<numElem(); ++i) {
-            Index_t *localNode = nodelist(perm(i)) ;
-            for (Index_t j=0; j<8; ++j) {
-               tmp[i*8+j] = localNode[j] ;
-            }
-         } // ) ;
-         memcpy(nodelist(0), tmp, 8*sizeof(Index_t)*numElem()) ;
-         delete [] tmp ;
-      }
-
-      /* permute lxim, lxip, letam, letap, lzetam, lzetap */
-      {
-         Index_t *tmp = new Index_t[6*numElem()] ;
-         Index_t *iperm = new Index_t[numElem()] ; /* inverse permutation */
-
-         // RAJA::forall<elem_exec_policy>(getElemISet(), [=] RAJA_DEVICE (int i) {
-         for (Index_t i=0; i<numElem(); ++i) {
-            iperm[perm(i)] = i ;
-         } // ) ;
-         // RAJA::forall<elem_exec_policy>(getElemISet(), [=] RAJA_DEVICE (int i) {
-         for (Index_t i=0; i<numElem(); ++i) {
-            tmp[i*6+0] = iperm[lxim(perm(i))] ;
-            tmp[i*6+1] = iperm[lxip(perm(i))] ;
-            tmp[i*6+2] = iperm[letam(perm(i))] ;
-            tmp[i*6+3] = iperm[letap(perm(i))] ;
-            tmp[i*6+4] = iperm[lzetam(perm(i))] ;
-            tmp[i*6+5] = iperm[lzetap(perm(i))] ;
-         } // ) ;
-         // RAJA::forall<elem_exec_policy>(getElemISet(), [=] RAJA_DEVICE (int i) {
-         for (Index_t i=0; i<numElem(); ++i) {
-            lxim(i) = tmp[i*6+0] ;
-            lxip(i) = tmp[i*6+1] ;
-            letam(i) = tmp[i*6+2] ;
-            letap(i) = tmp[i*6+3] ;
-            lzetam(i) = tmp[i*6+4] ;
-            lzetap(i) = tmp[i*6+5] ;
-         } // ) ;
-
-         initEnergyElemIdx = iperm[0] ;
-
-         delete [] iperm ;
-         delete [] tmp ;
-      }
-      /* permute elemBC */
-      {
-         Int_t *tmp = new Int_t[numElem()] ;
-         // RAJA::forall<elem_exec_policy>(getElemISet(), [=] RAJA_DEVICE (int i) {
-         for (Index_t i=0; i<numElem(); ++i) {
-            tmp[i] = elemBC(perm(i)) ;
-         } // ) ;
-         // RAJA::forall<elem_exec_policy>(getElemISet(), [=] RAJA_DEVICE (int i) {
-         for (Index_t i=0; i<numElem(); ++i) {
-            elemBC(i) = tmp[i] ;
-         } // ) ;
-         delete [] tmp ;
-      }
-   }
-
-   // Basic Field Initialization 
-   // RAJA::forall<elem_exec_policy>(getElemISet(), [=] RAJA_DEVICE (int i) {
-   for (Index_t i=0; i<numElem(); ++i) {
-      e(i) =  Real_t(0.0) ;
-      p(i) =  Real_t(0.0) ;
-      q(i) =  Real_t(0.0) ;
-      ss(i) = Real_t(0.0) ;
-   } // ) ;
-
-   // Note - v initializes to 1.0, not 0.0!
-   // RAJA::forall<elem_exec_policy>(getElemISet(), [=] RAJA_DEVICE (int i) {
-   for (Index_t i=0; i<numElem(); ++i) {
-      v(i) = Real_t(1.0) ;
-   } // ) ;
-
-   // RAJA::forall<node_exec_policy>(getNodeISet(), [=] RAJA_DEVICE (int i) {
-   for (Index_t i=0; i<numNode(); ++i) {
-      xd(i) = Real_t(0.0) ;
-      yd(i) = Real_t(0.0) ;
-      zd(i) = Real_t(0.0) ;
-   } // ) ;
-
-   // RAJA::forall<node_exec_policy>(getNodeISet(), [=] RAJA_DEVICE (int i) {
-   for (Index_t i=0; i<numNode(); ++i) {
-      xdd(i) = Real_t(0.0) ;
-      ydd(i) = Real_t(0.0) ;
-      zdd(i) = Real_t(0.0) ;
-   } // ) ;
-
-   // RAJA::forall<node_exec_policy>(getNodeISet(), [=] RAJA_DEVICE (int i) {
-   for (Index_t i=0; i<numNode(); ++i) {
-      nodalMass(i) = Real_t(0.0) ;
-   } // ) ;
-
-#if defined(OMP_FINE_SYNC)
-   SetupThreadSupportStructures();
-#endif
-
-
-   // Setup defaults
-
-   // These can be changed (requires recompile) if you want to run
-   // with a fixed timestep, or to a different end time, but it's
-   // probably easier/better to just run a fixed number of timesteps
-   // using the -i flag in 2.x
-
-   dtfixed() = Real_t(-1.0e-6) ; // Negative means use courant condition
-   stoptime()  = Real_t(1.0e-2); // *Real_t(edgeElems*tp/45.0) ;
-
-   // Initial conditions
-   deltatimemultlb() = Real_t(1.1) ;
-   deltatimemultub() = Real_t(1.2) ;
-   dtcourant() = Real_t(1.0e+20) ;
-   dthydro()   = Real_t(1.0e+20) ;
-   dtmax()     = Real_t(1.0e-2) ;
-   time()    = Real_t(0.) ;
-   cycle()   = Int_t(0) ;
-
-   // initialize field data 
-   // RAJA::forall<elem_exec_policy>(getElemISet(), [=] RAJA_DEVICE (int i) {
-   for (Index_t i=0; i<numElem(); ++i) {
-      Real_t x_local[8], y_local[8], z_local[8] ;
-      Index_t *elemToNode = nodelist(i) ;
-      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-      {
-        Index_t gnode = elemToNode[lnode];
-        x_local[lnode] = x(gnode);
-        y_local[lnode] = y(gnode);
-        z_local[lnode] = z(gnode);
-      }
-
-      // volume calculations
-      Real_t volume = CalcElemVolume(x_local, y_local, z_local );
-      volo(i) = volume ;
-      elemMass(i) = volume ;
-   } // ) ;
-
-   /* RAJA is not thread-safe here -- address when more policies defined */
-   // RAJA::forall<elem_exec_policy>(getElemISet(), [=] RAJA_DEVICE (int i) {
-   for (Index_t i=0; i<numElem(); ++i) {
-      Index_t *elemToNode = nodelist(i) ;
-      Real_t cornerMass = elemMass(i) / Real_t(8.0) ;
-      for (Index_t j=0; j<8; ++j) {
-         Index_t idx = elemToNode[j] ;
-         nodalMass(idx) += cornerMass ;
-      }
-   } // ) ;
-
-   // deposit initial energy
-   // An energy of 3.948746e+7 is correct for a problem with
-   // 45 zones along a side - we need to scale it
-   const Real_t ebase = Real_t(3.948746e+7);
-   Real_t scale = (nx*m_tp)/Real_t(45.0);
-   Real_t einit = ebase*scale*scale*scale;
-   if (m_rowLoc + m_colLoc + m_planeLoc == 0) {
-      // Dump into the first zone (which we know is in the corner)
-      // of the domain that sits at the origin
-      e(initEnergyElemIdx) = einit;
-   }
-   //set initial deltatime base on analytic CFL calculation
-   deltatime() = (Real_t(.5)*cbrt(volo(0)))/sqrt(Real_t(2.0)*einit);
-
-} // End constructor
-
-
-////////////////////////////////////////////////////////////////////////////////
-Domain::~Domain()
-{
-   delete [] m_regNumList;
-#if defined(OMP_FINE_SYNC)
-   Release(&m_nodeElemStart) ;
-   Release(&m_nodeElemCornerList) ;
-#endif
-   delete [] m_regElemSize;
-   if (numReg() != 1) {
-      for (Index_t i=0 ; i<numReg() ; ++i) {
-        delete [] m_regElemlist[i];
-      }
-   }
-   delete [] m_regElemlist;
-   
-   if (m_perm != 0) {
-      delete [] m_perm ;
-   }
-#if USE_MPI
-   delete [] commDataSend;
-   delete [] commDataRecv;
-#endif
-} // End destructor
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::BuildMeshTopology(Index_t edgeNodes, Index_t edgeElems)
-{
-  // embed hexehedral elements in nodal point lattice 
-  Index_t zidx = 0 ;
-  Index_t nidx = 0 ;
-  for (Index_t plane=0; plane<edgeElems; ++plane) {
-    for (Index_t row=0; row<edgeElems; ++row) {
-      for (Index_t col=0; col<edgeElems; ++col) {
-        Index_t *localNode = nodelist(zidx) ;
-        localNode[0] = nidx                                       ;
-        localNode[1] = nidx                                   + 1 ;
-        localNode[2] = nidx                       + edgeNodes + 1 ;
-        localNode[3] = nidx                       + edgeNodes     ;
-        localNode[4] = nidx + edgeNodes*edgeNodes                 ;
-        localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
-        localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
-        localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
-        ++zidx ;
-        ++nidx ;
-      }
-      ++nidx ;
-    }
-    nidx += edgeNodes ;
-  }
-}
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::BuildMeshCoordinates(Index_t nx, Index_t edgeNodes)
-{
-  Index_t meshEdgeElems = m_tp*nx ;
-
-  // initialize nodal coordinates 
-  Index_t nidx = 0 ;
-  Real_t tz = Real_t(1.125)*Real_t(m_planeLoc*nx)/Real_t(meshEdgeElems) ;
-  for (Index_t plane=0; plane<edgeNodes; ++plane) {
-    Real_t ty = Real_t(1.125)*Real_t(m_rowLoc*nx)/Real_t(meshEdgeElems) ;
-    for (Index_t row=0; row<edgeNodes; ++row) {
-      Real_t tx = Real_t(1.125)*Real_t(m_colLoc*nx)/Real_t(meshEdgeElems) ;
-      for (Index_t col=0; col<edgeNodes; ++col) {
-        x(nidx) = tx ;
-        y(nidx) = ty ;
-        z(nidx) = tz ;
-        ++nidx ;
-        // tx += ds ; // may accumulate roundoff... 
-        tx = Real_t(1.125)*Real_t(m_colLoc*nx+col+1)/Real_t(meshEdgeElems) ;
-      }
-      // ty += ds ;  // may accumulate roundoff... 
-      ty = Real_t(1.125)*Real_t(m_rowLoc*nx+row+1)/Real_t(meshEdgeElems) ;
-    }
-    // tz += ds ;  // may accumulate roundoff... 
-    tz = Real_t(1.125)*Real_t(m_planeLoc*nx+plane+1)/Real_t(meshEdgeElems) ;
-  }
-
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-#if defined(OMP_FINE_SYNC)
-void
-Domain::SetupThreadSupportStructures()
-{
-  // set up node-centered indexing of elements 
-  Index_t *nodeElemCount = new Index_t[numNode()] ;
-
-  for (Index_t i=0; i<numNode(); ++i) {
-    nodeElemCount[i] = 0 ;
-  }
-
-  for (Index_t i=0; i<numElem(); ++i) {
-    Index_t *nl = nodelist(i) ;
-    for (Index_t j=0; j < 8; ++j) {
-      ++(nodeElemCount[nl[j]] );
-    }
-  }
-
-  m_nodeElemStart = Allocate<Index_t>(numNode()+1) ;
-
-  m_nodeElemStart[0] = 0;
-
-  for (Index_t i=1; i <= numNode(); ++i) {
-    m_nodeElemStart[i] =
-      m_nodeElemStart[i-1] + nodeElemCount[i-1] ;
-  }
-       
-  m_nodeElemCornerList = Allocate<Index_t>(m_nodeElemStart[numNode()]);
-
-  for (Index_t i=0; i < numNode(); ++i) {
-    nodeElemCount[i] = 0;
-  }
-
-  for (Index_t i=0; i < numElem(); ++i) {
-    Index_t *nl = nodelist(i) ;
-    for (Index_t j=0; j < 8; ++j) {
-      Index_t m = nl[j];
-      Index_t k = i*8 + j ;
-      Index_t offset = m_nodeElemStart[m] + nodeElemCount[m] ;
-      m_nodeElemCornerList[offset] = k;
-      ++(nodeElemCount[m]) ;
-    }
-  }
-
-  Index_t clSize = m_nodeElemStart[numNode()] ;
-  for (Index_t i=0; i < clSize; ++i) {
-    Index_t clv = m_nodeElemCornerList[i] ;
-    if ((clv < 0) || (clv > numElem()*8)) {
-      fprintf(stderr,
-              "AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n");
-#if USE_MPI
-      MPI_Abort(MPI_COMM_WORLD, -1);
-#else
-      exit(-1);
-#endif
-    }
-  }
-
-  delete [] nodeElemCount ;
-}
-#endif
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::SetupCommBuffers(Index_t edgeNodes)
-{
-  // allocate a buffer large enough for nodal ghost data 
-  Index_t maxEdgeSize = MAX(this->sizeX(), MAX(this->sizeY(), this->sizeZ()))+1 ;
-  m_maxPlaneSize = CACHE_ALIGN_REAL(maxEdgeSize*maxEdgeSize) ;
-  m_maxEdgeSize = CACHE_ALIGN_REAL(maxEdgeSize) ;
-
-  // assume communication to 6 neighbors by default 
-  m_rowMin = (m_rowLoc == 0)        ? 0 : 1;
-  m_rowMax = (m_rowLoc == m_tp-1)     ? 0 : 1;
-  m_colMin = (m_colLoc == 0)        ? 0 : 1;
-  m_colMax = (m_colLoc == m_tp-1)     ? 0 : 1;
-  m_planeMin = (m_planeLoc == 0)    ? 0 : 1;
-  m_planeMax = (m_planeLoc == m_tp-1) ? 0 : 1;
-
-#if USE_MPI   
-  // account for face communication 
-  Index_t comBufSize =
-    (m_rowMin + m_rowMax + m_colMin + m_colMax + m_planeMin + m_planeMax) *
-    m_maxPlaneSize * MAX_FIELDS_PER_MPI_COMM ;
-
-  // account for edge communication 
-  comBufSize +=
-    ((m_rowMin & m_colMin) + (m_rowMin & m_planeMin) + (m_colMin & m_planeMin) +
-     (m_rowMax & m_colMax) + (m_rowMax & m_planeMax) + (m_colMax & m_planeMax) +
-     (m_rowMax & m_colMin) + (m_rowMin & m_planeMax) + (m_colMin & m_planeMax) +
-     (m_rowMin & m_colMax) + (m_rowMax & m_planeMin) + (m_colMax & m_planeMin)) *
-    m_maxEdgeSize * MAX_FIELDS_PER_MPI_COMM ;
-
-  // account for corner communication 
-  // factor of 16 is so each buffer has its own cache line 
-  comBufSize += ((m_rowMin & m_colMin & m_planeMin) +
-                 (m_rowMin & m_colMin & m_planeMax) +
-                 (m_rowMin & m_colMax & m_planeMin) +
-                 (m_rowMin & m_colMax & m_planeMax) +
-                 (m_rowMax & m_colMin & m_planeMin) +
-                 (m_rowMax & m_colMin & m_planeMax) +
-                 (m_rowMax & m_colMax & m_planeMin) +
-                 (m_rowMax & m_colMax & m_planeMax)) * CACHE_COHERENCE_PAD_REAL ;
-
-  this->commDataSend = new Real_t[comBufSize] ;
-  this->commDataRecv = new Real_t[comBufSize] ;
-  // prevent floating point exceptions 
-  memset(this->commDataSend, 0, comBufSize*sizeof(Real_t)) ;
-  memset(this->commDataRecv, 0, comBufSize*sizeof(Real_t)) ;
-#endif   
-
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::CreateMeshIndexSets()
-{
-   // leave nodes and elems in canonical ordering for now...
-   m_domNodeISet.push_back( RAJA::RangeSegment(0, numNode()) );   
-   m_domElemISet.push_back( RAJA::RangeSegment(0, numElem()) );
-}
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::CreateRegionIndexSets(Int_t nr, Int_t balance)
-{
-#if USE_MPI   
-   Index_t myRank;
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-   srand(myRank);
-#else
-   srand(0);
-   Index_t myRank = 0;
-#endif
-   this->numReg() = nr;
-   m_regElemSize = new Index_t[numReg()];
-   m_regElemlist = new Index_t*[numReg()];
-   Index_t nextIndex = 0;
-   //if we only have one region just fill it
-   // Fill out the regNumList with material numbers, which are always
-   // the region index plus one 
-   if(numReg() == 1) {
-      while (nextIndex < numElem()) {
-         this->regNumList(nextIndex) = 1;
-         nextIndex++;
-      }
-      regElemSize(0) = numElem();
-      m_domRegISet.resize(numReg());
-      m_domRegISet[0].push_back( RAJA::RangeSegment(0, regElemSize(0)) ) ;
-      m_domElemRegISet.push_back( RAJA::RangeSegment(0, regElemSize(0)) ) ;
-#if !defined(LULESH_LIST_INDEXSET)
-      for (int i=0; i<numElem(); ++i) {
-         perm(i) = i ;
-      }
-#endif
-   }
-   //If we have more than one region distribute the elements.
-   else {
-      Int_t regionNum;
-      Int_t regionVar;
-      Int_t lastReg = -1;
-      Int_t binSize;
-      Index_t elements;
-      Index_t runto = 0;
-      Int_t costDenominator = 0;
-      Int_t* regBinEnd = new Int_t[numReg()];
-      //Determine the relative weights of all the regions.  This is based off the -b flag.  Balance is the value passed into b.  
-      for (Index_t i=0 ; i<numReg() ; ++i) {
-         regElemSize(i) = 0;
-         costDenominator += pow((i+1), balance);  //Total sum of all regions weights
-         regBinEnd[i] = costDenominator;  //Chance of hitting a given region is (regBinEnd[i] - regBinEdn[i-1])/costDenominator
-      }
-      //Until all elements are assigned
-      while (nextIndex < numElem()) {
-         //pick the region
-         regionVar = rand() % costDenominator;
-         Index_t i = 0;
-         while(regionVar >= regBinEnd[i])
-            i++;
-         //rotate the regions based on MPI rank.  Rotation is Rank % NumRegions this makes each domain have a different region with 
-         //the highest representation
-         regionNum = ((i + myRank) % numReg()) + 1;
-         // make sure we don't pick the same region twice in a row
-         while(regionNum == lastReg) {
-            regionVar = rand() % costDenominator;
-            i = 0;
-            while(regionVar >= regBinEnd[i])
-               i++;
-            regionNum = ((i + myRank) % numReg()) + 1;
-         }
-         //Pick the bin size of the region and determine the number of elements.
-         binSize = rand() % 1000;
-         if(binSize < 773) {
-           elements = rand() % 15 + 1;
-         }
-         else if(binSize < 937) {
-           elements = rand() % 16 + 16;
-         }
-         else if(binSize < 970) {
-           elements = rand() % 32 + 32;
-         }
-         else if(binSize < 974) {
-           elements = rand() % 64 + 64;
-         } 
-         else if(binSize < 978) {
-           elements = rand() % 128 + 128;
-         }
-         else if(binSize < 981) {
-           elements = rand() % 256 + 256;
-         }
-         else
-            elements = rand() % 1537 + 512;
-         runto = elements + nextIndex;
-         //Store the elements.  If we hit the end before we run out of elements then just stop.
-         while (nextIndex < runto && nextIndex < numElem()) {
-            this->regNumList(nextIndex) = regionNum;
-            nextIndex++;
-         }
-         lastReg = regionNum;
-      } 
-
-      delete [] regBinEnd;
-
-      // Convert regNumList to region index sets
-      // First, count size of each region 
-      for (Index_t i=0 ; i<numElem() ; ++i) {
-         int r = this->regNumList(i)-1; // region index == regnum-1
-         regElemSize(r)++;
-      }
-      // Second, allocate each region index set
-      for (Index_t i=0 ; i<numReg() ; ++i) {
-         m_regElemlist[i] = new Index_t[regElemSize(i)];
-         regElemSize(i) = 0;
-      }
-      // Third, fill index sets
-      for (Index_t i=0 ; i<numElem() ; ++i) {
-         Index_t r = regNumList(i)-1;       // region index == regnum-1
-         Index_t regndx = regElemSize(r)++; // Note increment
-         regElemlist(r,regndx) = i;
-      }
-
-      // Create HybridISets for regions
-      m_domRegISet.resize(numReg());
-      int elemCount = 0 ;
-      for (int r = 0; r < numReg(); ++r) {
-#if !defined(LULESH_LIST_INDEXSET)
-         memcpy( &perm(elemCount), regElemlist(r), sizeof(Index_t)*regElemSize(r) ) ;
-         m_domRegISet[r].push_back( RAJA::RangeSegment(elemCount, elemCount+regElemSize(r)) );
-         m_domElemRegISet.push_back( RAJA::RangeSegment(elemCount, elemCount+regElemSize(r)) ) ;
-         elemCount += regElemSize(r) ;
-#else
-         m_domRegISet[r].push_back( RAJA::ListSegment(regElemlist(r), regElemSize(r)) );
-         m_domElemRegISet.push_back( RAJA::ListSegment(regElemlist(r), regElemSize(r)) ) ;
-#endif
-      }
-
-#if 0 // Check correctness of index sets
-      for (int r = 0; r < numReg(); ++r) {
-         bool good = true;
-         if ( regElemSize(r) != m_domRegISet[r].getLength() ) good = false;
-         if (good) {
-            Index_t* regList = regElemlist(r);
-            int i = 0; 
-            RAJA::forall< LULESH_ISET::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec> >(m_domRegISet[r], [=] RAJA_DEVICE (int idx) { 
-               good &= (idx == regList[i]);
-               i++;
-            } );
-         }
-         printf("\nRegion %d index set is %s\n", r, (good ? "GOOD" : "BAD")); 
-      }
-#endif
-   }
-   
-}
-
-/////////////////////////////////////////////////////////////
-void 
-Domain::CreateSymmetryIndexSets(Index_t edgeNodes)
-{
-  if (m_planeLoc == 0) {
-    m_domZSymNodeISet.push_back( RAJA::RangeSegment(0, edgeNodes*edgeNodes) );
-  }
-  if (m_rowLoc == 0) {
-    Index_t *nset = new Index_t[edgeNodes*edgeNodes] ;
-    Index_t nidx = 0 ;
-    for (Index_t i=0; i<edgeNodes; ++i) {
-      Index_t planeInc = i*edgeNodes*edgeNodes ;
-      for (Index_t j=0; j<edgeNodes; ++j) {
-        nset[nidx++] = planeInc + j ;
-      }
-    }
-    m_domYSymNodeISet.push_back( RAJA::ListSegment(nset, edgeNodes*edgeNodes) );
-    delete [] nset ;
-  }
-  if (m_colLoc == 0) {
-    Index_t *nset = new Index_t[edgeNodes*edgeNodes] ;
-    Index_t nidx = 0 ;
-    for (Index_t i=0; i<edgeNodes; ++i) {
-      Index_t planeInc = i*edgeNodes*edgeNodes ;
-      for (Index_t j=0; j<edgeNodes; ++j) {
-        nset[nidx++] = planeInc + j*edgeNodes ;
-      }
-    }
-    m_domXSymNodeISet.push_back( RAJA::ListSegment(nset, edgeNodes*edgeNodes) );
-    delete [] nset ;
-  }
-}
-
-
-
-/////////////////////////////////////////////////////////////
-void
-Domain::SetupElementConnectivities(Index_t edgeElems)
-{
-   lxim(0) = 0 ;
-   for (Index_t i=1; i<numElem(); ++i) {
-      lxim(i)   = i-1 ;
-      lxip(i-1) = i ;
-   }
-   lxip(numElem()-1) = numElem()-1 ;
-
-   for (Index_t i=0; i<edgeElems; ++i) {
-      letam(i) = i ; 
-      letap(numElem()-edgeElems+i) = numElem()-edgeElems+i ;
-   }
-   for (Index_t i=edgeElems; i<numElem(); ++i) {
-      letam(i) = i-edgeElems ;
-      letap(i-edgeElems) = i ;
-   }
-
-   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
-      lzetam(i) = i ;
-      lzetap(numElem()-edgeElems*edgeElems+i) = numElem()-edgeElems*edgeElems+i ;
-   }
-   for (Index_t i=edgeElems*edgeElems; i<numElem(); ++i) {
-      lzetam(i) = i - edgeElems*edgeElems ;
-      lzetap(i-edgeElems*edgeElems) = i ;
-   }
-}
-
-/////////////////////////////////////////////////////////////
-void
-Domain::SetupBoundaryConditions(Index_t edgeElems) 
-{
-  Index_t ghostIdx[6] ;  // offsets to ghost locations
-
-  // set up boundary condition information
-  for (Index_t i=0; i<numElem(); ++i) {
-     elemBC(i) = Int_t(0) ;
-  }
-
-  for (Index_t i=0; i<6; ++i) {
-    ghostIdx[i] = INT_MIN ;
-  }
-
-  Int_t pidx = numElem() ;
-  if (m_planeMin != 0) {
-    ghostIdx[0] = pidx ;
-    pidx += sizeX()*sizeY() ;
-  }
-
-  if (m_planeMax != 0) {
-    ghostIdx[1] = pidx ;
-    pidx += sizeX()*sizeY() ;
-  }
-
-  if (m_rowMin != 0) {
-    ghostIdx[2] = pidx ;
-    pidx += sizeX()*sizeZ() ;
-  }
-
-  if (m_rowMax != 0) {
-    ghostIdx[3] = pidx ;
-    pidx += sizeX()*sizeZ() ;
-  }
-
-  if (m_colMin != 0) {
-    ghostIdx[4] = pidx ;
-    pidx += sizeY()*sizeZ() ;
-  }
-
-  if (m_colMax != 0) {
-    ghostIdx[5] = pidx ;
-  }
-
-  // symmetry plane or free surface BCs 
-  for (Index_t i=0; i<edgeElems; ++i) {
-    Index_t planeInc = i*edgeElems*edgeElems ;
-    Index_t rowInc   = i*edgeElems ;
-    for (Index_t j=0; j<edgeElems; ++j) {
-      if (m_planeLoc == 0) {
-        elemBC(rowInc+j) |= ZETA_M_SYMM ;
-      }
-      else {
-        elemBC(rowInc+j) |= ZETA_M_COMM ;
-        lzetam(rowInc+j) = ghostIdx[0] + rowInc + j ;
-      }
-
-      if (m_planeLoc == m_tp-1) {
-        elemBC(rowInc+j+numElem()-edgeElems*edgeElems) |=
-          ZETA_P_FREE;
-      }
-      else {
-        elemBC(rowInc+j+numElem()-edgeElems*edgeElems) |=
-          ZETA_P_COMM ;
-        lzetap(rowInc+j+numElem()-edgeElems*edgeElems) =
-          ghostIdx[1] + rowInc + j ;
-      }
-
-      if (m_rowLoc == 0) {
-        elemBC(planeInc+j) |= ETA_M_SYMM ;
-      }
-      else {
-        elemBC(planeInc+j) |= ETA_M_COMM ;
-        letam(planeInc+j) = ghostIdx[2] + rowInc + j ;
-      }
-
-      if (m_rowLoc == m_tp-1) {
-        elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= 
-          ETA_P_FREE ;
-      }
-      else {
-        elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= 
-          ETA_P_COMM ;
-        letap(planeInc+j+edgeElems*edgeElems-edgeElems) =
-          ghostIdx[3] +  rowInc + j ;
-      }
-
-      if (m_colLoc == 0) {
-        elemBC(planeInc+j*edgeElems) |= XI_M_SYMM ;
-      }
-      else {
-        elemBC(planeInc+j*edgeElems) |= XI_M_COMM ;
-        lxim(planeInc+j*edgeElems) = ghostIdx[4] + rowInc + j ;
-      }
-
-      if (m_colLoc == m_tp-1) {
-        elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_FREE ;
-      }
-      else {
-        elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_COMM ;
-        lxip(planeInc+j*edgeElems+edgeElems-1) =
-          ghostIdx[5] + rowInc + j ;
-      }
-    }
-  }
-}
-
-///////////////////////////////////////////////////////////////////////////
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side)
-{
-   Int_t testProcs;
-   Int_t dx, dy, dz;
-   Int_t myDom;
-   
-   // Assume cube processor layout for now 
-   testProcs = Int_t(cbrt(Real_t(numRanks))+0.5) ;
-   if (testProcs*testProcs*testProcs != numRanks) {
-      printf("Num processors must be a cube of an integer (1, 8, 27, ...)\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   if (sizeof(Real_t) != 4 && sizeof(Real_t) != 8) {
-      printf("MPI operations only support float and double right now...\n");
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   if (MAX_FIELDS_PER_MPI_COMM > CACHE_COHERENCE_PAD_REAL) {
-      printf("corner element comm buffers too small.  Fix code.\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-
-   dx = testProcs ;
-   dy = testProcs ;
-   dz = testProcs ;
-
-   // temporary test
-   if (dx*dy*dz != numRanks) {
-      printf("error -- must have as many domains as procs\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   Int_t remainder = dx*dy*dz % numRanks ;
-   if (myRank < remainder) {
-      myDom = myRank*( 1+ (dx*dy*dz / numRanks)) ;
-   }
-   else {
-      myDom = remainder*( 1+ (dx*dy*dz / numRanks)) +
-         (myRank - remainder)*(dx*dy*dz/numRanks) ;
-   }
-
-   *col = myDom % dx ;
-   *row = (myDom / dx) % dy ;
-   *plane = myDom / (dx*dy) ;
-   *side = testProcs;
-
-   return;
-}
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-util.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-util.cc
deleted file mode 100644
index dd00dbb5e..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-util.cc
+++ /dev/null
@@ -1,233 +0,0 @@
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <stdio.h>
-#if USE_MPI
-#include <mpi.h>
-#endif
-#include "lulesh.h"
-
-/* Helper function for converting strings to ints, with error checking */
-int StrToInt(const char *token, int *retVal)
-{
-   const char *c ;
-   char *endptr ;
-   const int decimal_base = 10 ;
-
-   if (token == NULL)
-      return 0 ;
-   
-   c = token ;
-   *retVal = (int)strtol(c, &endptr, decimal_base) ;
-   if((endptr != c) && ((*endptr == ' ') || (*endptr == '\0')))
-      return 1 ;
-   else
-      return 0 ;
-}
-
-static void PrintCommandLineOptions(char *execname, int myRank)
-{
-   if (myRank == 0) {
-
-      printf("Usage: %s [opts]\n", execname);
-      printf(" where [opts] is one or more of:\n");
-      printf(" -q              : quiet mode - suppress all stdout\n");
-      printf(" -i <iterations> : number of cycles to run\n");
-      printf(" -s <size>       : length of cube mesh along side\n");
-      printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
-      printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
-      printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
-      printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
-      printf(" -p              : Print out progress\n");
-      printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
-      printf(" -h              : This message\n");
-      printf("\n\n");
-   }
-}
-
-static void ParseError(const char *message, int myRank)
-{
-   if (myRank == 0) {
-      printf("%s\n", message);
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1);
-#else
-      exit(-1);
-#endif
-   }
-}
-
-void ParseCommandLineOptions(int argc, char *argv[],
-                             int myRank, struct cmdLineOpts *opts)
-{
-   if(argc > 1) {
-      int i = 1;
-
-      while(i < argc) {
-         int ok;
-         /* -i <iterations> */
-         if(strcmp(argv[i], "-i") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -i", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->its));
-            if(!ok) {
-               ParseError("Parse Error on option -i integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -s <size, sidelength> */
-         else if(strcmp(argv[i], "-s") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -s\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->nx));
-            if(!ok) {
-               ParseError("Parse Error on option -s integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-	 /* -r <numregions> */
-         else if (strcmp(argv[i], "-r") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -r\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->numReg));
-            if (!ok) {
-               ParseError("Parse Error on option -r integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-	 /* -f <numfilepieces> */
-         else if (strcmp(argv[i], "-f") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -f\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->numFiles));
-            if (!ok) {
-               ParseError("Parse Error on option -f integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -p */
-         else if (strcmp(argv[i], "-p") == 0) {
-            opts->showProg = 1;
-            i++;
-         }
-         /* -q */
-         else if (strcmp(argv[i], "-q") == 0) {
-            opts->quiet = 1;
-            i++;
-         }
-         else if (strcmp(argv[i], "-b") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -b\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->balance));
-            if (!ok) {
-               ParseError("Parse Error on option -b integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         else if (strcmp(argv[i], "-c") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -c\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->cost));
-            if (!ok) {
-               ParseError("Parse Error on option -c integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -v */
-         else if (strcmp(argv[i], "-v") == 0) {
-#if VIZ_MESH            
-            opts->viz = 1;
-#else
-            ParseError("Use of -v requires compiling with -DVIZ_MESH\n", myRank);
-#endif
-            i++;
-         }
-         /* -h */
-         else if (strcmp(argv[i], "-h") == 0) {
-            PrintCommandLineOptions(argv[0], myRank);
-#if USE_MPI            
-            MPI_Abort(MPI_COMM_WORLD, 0);
-#else
-            exit(0);
-#endif
-         }
-         else {
-            char msg[80];
-            PrintCommandLineOptions(argv[0], myRank);
-            sprintf(msg, "ERROR: Unknown command line argument: %s\n", argv[i]);
-            ParseError(msg, myRank);
-         }
-      }
-   }
-}
-
-/////////////////////////////////////////////////////////////////////
-
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks)
-{
-   // GrindTime1 only takes a single domain into account, and is thus a good way to measure
-   // processor speed indepdendent of MPI parallelism.
-   // GrindTime2 takes into account speedups from MPI parallelism 
-   Real_t grindTime1 = ((elapsed_time*1e6)/locDom.cycle())/(nx*nx*nx);
-   Real_t grindTime2 = ((elapsed_time*1e6)/locDom.cycle())/(nx*nx*nx*numRanks);
-   Real_t   maxAbsDiff = Real_t(0.0);
-   Real_t totalAbsDiff = Real_t(0.0);
-   Real_t   maxRelDiff = Real_t(0.0);
-   Index_t elemId ;
-   Index_t *iperm = new Index_t[locDom.numElem()] ;
-
-   for (Index_t i=0; i<locDom.numElem(); ++i) {
-      iperm[locDom.perm(i)] = i ;
-   }
-
-   elemId = iperm[0] ;
-
-   printf("Run completed:  \n");
-   printf("   Problem size        =  %i \n",    nx);
-   printf("   MPI tasks           =  %i \n",    numRanks);
-   printf("   Iteration count     =  %i \n",    locDom.cycle());
-   printf("   Final Origin Energy = %12.6e \n", locDom.e(elemId));
-
-
-   for (Index_t j=0; j<nx; ++j) {
-      for (Index_t k=j+1; k<nx; ++k) {
-         Real_t AbsDiff = fabs(locDom.e(iperm[j*nx+k])-locDom.e(iperm[k*nx+j]));
-         totalAbsDiff  += AbsDiff;
-
-         if (maxAbsDiff <AbsDiff) {
-            maxAbsDiff = AbsDiff;
-         }
-
-         if (locDom.e(iperm[k*nx+j]) != 0.0) {
-            Real_t RelDiff = AbsDiff / locDom.e(iperm[k*nx+j]);
-            if (maxRelDiff <RelDiff) {
-               maxRelDiff = RelDiff;
-            }
-         }
-      }
-   }
-
-   delete [] iperm ;
-
-   // Quick symmetry check
-   printf("   Testing Plane 0 of Energy Array on rank 0:\n");
-   printf("        MaxAbsDiff   = %12.6e\n",   maxAbsDiff   );
-   printf("        TotalAbsDiff = %12.6e\n",   totalAbsDiff );
-   printf("        MaxRelDiff   = %12.6e\n\n", maxRelDiff   );
-
-   // Timing information
-   printf("\nElapsed time         = %10.2f (s)\n", elapsed_time);
-   printf("Grind time (us/z/c)  = %10.8g (per dom)  (%10.8g overall)\n", grindTime1, grindTime2);
-   printf("FOM                  = %10.8g (z/s)\n\n", 1000.0/grindTime2); // zones per second
-
-   return ;
-}
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-viz.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-viz.cc
deleted file mode 100644
index f0d1f36e4..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh-viz.cc
+++ /dev/null
@@ -1,432 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include "lulesh.h"
-
-#if defined(VIZ_MESH)
-
-#ifdef __cplusplus
-  extern "C" {
-#endif
-#include "silo.h"
-#if USE_MPI
-# include "pmpio.h"
-#endif
-#ifdef __cplusplus
-  }
-#endif
-
-// Function prototypes
-static void 
-DumpDomainToVisit(DBfile *db, Domain& domain, int myRank);
-static
-
-
-#if USE_MPI
-// For some reason, earlier versions of g++ (e.g. 4.2) won't let me
-// put the 'static' qualifier on this prototype, even if it's done
-// consistently in the prototype and definition
-void
-DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 
-                      char basename[], int numRanks);
-
-// Callback prototypes for PMPIO interface (only useful if we're
-// running parallel)
-static void *
-LULESH_PMPIO_Create(const char *fname,
-		     const char *dname,
-		     void *udata);
-static void *
-LULESH_PMPIO_Open(const char *fname,
-		   const char *dname,
-		   PMPIO_iomode_t ioMode,
-		   void *udata);
-static void
-LULESH_PMPIO_Close(void *file, void *udata);
-
-#else
-void
-DumpMultiblockObjects(DBfile *db, char basename[], int numRanks);
-#endif
-
-
-/**********************************************************************/
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks) 
-{
-  char subdirName[32];
-  char basename[32];
-  DBfile *db;
-
-
-  sprintf(basename, "lulesh_plot_c%d", domain.cycle());
-  sprintf(subdirName, "data_%d", myRank);
-
-#if USE_MPI
-
-  PMPIO_baton_t *bat = PMPIO_Init(numFiles,
-				  PMPIO_WRITE,
-				  MPI_COMM_WORLD,
-				  10101,
-				  LULESH_PMPIO_Create,
-				  LULESH_PMPIO_Open,
-				  LULESH_PMPIO_Close,
-				  NULL);
-
-  int myiorank = PMPIO_GroupRank(bat, myRank);
-
-  char fileName[64];
-  
-  if (myiorank == 0) 
-    strcpy(fileName, basename);
-  else
-    sprintf(fileName, "%s.%03d", basename, myiorank);
-
-  db = (DBfile*)PMPIO_WaitForBaton(bat, fileName, subdirName);
-
-  DumpDomainToVisit(db, domain, myRank);
-
-  // Processor 0 writes out bit of extra data to its file that
-  // describes how to stitch all the pieces together
-  if (myRank == 0) {
-    DumpMultiblockObjects(db, bat, basename, numRanks);
-  }
-
-  PMPIO_HandOffBaton(bat, db);
-
-  PMPIO_Finish(bat);
-#else
-
-  db = (DBfile*)DBCreate(basename, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X);
-
-  if (db) {
-     DBMkDir(db, subdirName);
-     DBSetDir(db, subdirName);
-     DumpDomainToVisit(db, domain, myRank);
-     DumpMultiblockObjects(db, basename, numRanks);
-  }
-  else {
-     printf("Error writing out viz file - rank %d\n", myRank);
-  }
-
-#endif
-}
-
-
-
-/**********************************************************************/
-
-static void 
-DumpDomainToVisit(DBfile *db, Domain& domain, int myRank)
-{
-   int ok = 0;
-   
-   /* Create an option list that will give some hints to VisIt for
-    * printing out the cycle and time in the annotations */
-   DBoptlist *optlist;
-
-
-   /* Write out the mesh connectivity in fully unstructured format */
-   int shapetype[1] = {DB_ZONETYPE_HEX};
-   int shapesize[1] = {8};
-   int shapecnt[1] = {domain.numElem()};
-   int *conn = new int[domain.numElem()*8] ;
-   int ci = 0 ;
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      Index_t *elemToNode = domain.nodelist(ei) ;
-      for (int ni=0; ni < 8; ++ni) {
-         conn[ci++] = elemToNode[ni] ;
-      }
-   }
-   ok += DBPutZonelist2(db, "connectivity", domain.numElem(), 3,
-                        conn, domain.numElem()*8,
-                        0,0,0, /* Not carrying ghost zones */
-                        shapetype, shapesize, shapecnt,
-                        1, NULL);
-   delete [] conn ;
-
-   /* Write out the mesh coordinates associated with the mesh */
-   const char* coordnames[3] = {"X", "Y", "Z"};
-   float *coords[3] ;
-   coords[0] = new float[domain.numNode()] ;
-   coords[1] = new float[domain.numNode()] ;
-   coords[2] = new float[domain.numNode()] ;
-   for (int ni=0; ni < domain.numNode() ; ++ni) {
-      coords[0][ni] = float(domain.x(ni)) ;
-      coords[1][ni] = float(domain.y(ni)) ;
-      coords[2][ni] = float(domain.z(ni)) ;
-   }
-   optlist = DBMakeOptlist(2);
-   ok += DBAddOption(optlist, DBOPT_DTIME, &domain.time());
-   ok += DBAddOption(optlist, DBOPT_CYCLE, &domain.cycle());
-   ok += DBPutUcdmesh(db, "mesh", 3, (char**)&coordnames[0], (float**)coords,
-                      domain.numNode(), domain.numElem(), "connectivity",
-                      0, DB_FLOAT, optlist);
-   ok += DBFreeOptlist(optlist);
-   delete [] coords[2] ;
-   delete [] coords[1] ;
-   delete [] coords[0] ;
-
-   /* Write out the materials */
-   int *matnums = new int[domain.numReg()];
-   int dims[1] = {domain.numElem()}; // No mixed elements
-   for(int i=0 ; i<domain.numReg() ; ++i)
-      matnums[i] = i+1;
-   
-   ok += DBPutMaterial(db, "regions", "mesh", domain.numReg(),
-                       matnums, domain.regNumList(), dims, 1,
-                       NULL, NULL, NULL, NULL, 0, DB_FLOAT, NULL);
-   delete [] matnums;
-
-   /* Write out pressure, energy, relvol, q */
-
-   float *e = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      e[ei] = float(domain.e(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "e", "mesh", e,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] e ;
-
-
-   float *p = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      p[ei] = float(domain.p(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "p", "mesh", p,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] p ;
-
-   float *v = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      v[ei] = float(domain.v(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "v", "mesh", v,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] v ;
-
-   float *q = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      q[ei] = float(domain.q(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "q", "mesh", q,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] q ;
-
-   /* Write out nodal speed, velocities */
-   float *zd    = new float[domain.numNode()];
-   float *yd    = new float[domain.numNode()];
-   float *xd    = new float[domain.numNode()];
-   float *speed = new float[domain.numNode()];
-   for(int ni=0 ; ni < domain.numNode() ; ++ni) {
-      xd[ni]    = float(domain.xd(ni));
-      yd[ni]    = float(domain.yd(ni));
-      zd[ni]    = float(domain.zd(ni));
-      speed[ni] = float(sqrt((xd[ni]*xd[ni])+(yd[ni]*yd[ni])+(zd[ni]*zd[ni])));
-   }
-
-   ok += DBPutUcdvar1(db, "speed", "mesh", speed,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] speed;
-
-
-   ok += DBPutUcdvar1(db, "xd", "mesh", xd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] xd ;
-
-   ok += DBPutUcdvar1(db, "yd", "mesh", yd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] yd ;
-
-   ok += DBPutUcdvar1(db, "zd", "mesh", zd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] zd ;
-
-
-   if (ok != 0) {
-      printf("Error writing out viz file - rank %d\n", myRank);
-   }
-}
-
-/**********************************************************************/
-
-#if USE_MPI     
-void
-   DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 
-                         char basename[], int numRanks)
-#else
-void
-  DumpMultiblockObjects(DBfile *db, char basename[], int numRanks)
-#endif
-{
-   /* MULTIBLOCK objects to tie together multiple files */
-  char **multimeshObjs;
-  char **multimatObjs;
-  char ***multivarObjs;
-  int *blockTypes;
-  int *varTypes;
-  int ok = 0;
-  // Make sure this list matches what's written out above
-  char vars[][10] = {"p","e","v","q", "speed", "xd", "yd", "zd"};
-  int numvars = sizeof(vars)/sizeof(vars[0]);
-
-  // Reset to the root directory of the silo file
-  DBSetDir(db, "/");
-
-  // Allocate a bunch of space for building up the string names
-  multimeshObjs = new char*[numRanks];
-  multimatObjs = new char*[numRanks];
-  multivarObjs = new char**[numvars];
-  blockTypes = new int[numRanks];
-  varTypes = new int[numRanks];
-
-  for(int v=0 ; v<numvars ; ++v) {
-     multivarObjs[v] = new char*[numRanks];
-  }
-  
-  for(int i=0 ; i<numRanks ; ++i) {
-     multimeshObjs[i] = new char[64];
-     multimatObjs[i] = new char[64];
-     for(int v=0 ; v<numvars ; ++v) {
-        multivarObjs[v][i] = new char[64];
-     }
-     blockTypes[i] = DB_UCDMESH;
-     varTypes[i] = DB_UCDVAR;
-  }
-      
-  // Build up the multiobject names
-  for(int i=0 ; i<numRanks ; ++i) {
-#if USE_MPI     
-    int iorank = PMPIO_GroupRank(bat, i);
-#else
-    int iorank = 0;
-#endif
-
-    //delete multivarObjs[i];
-    if (iorank == 0) {
-      snprintf(multimeshObjs[i], 64, "/data_%d/mesh", i);
-      snprintf(multimatObjs[i], 64, "/data_%d/regions",i);
-      for(int v=0 ; v<numvars ; ++v) {
-	snprintf(multivarObjs[v][i], 64, "/data_%d/%s", i, vars[v]);
-      }
-     
-    }
-    else {
-      snprintf(multimeshObjs[i], 64, "%s.%03d:/data_%d/mesh",
-               basename, iorank, i);
-      snprintf(multimatObjs[i], 64, "%s.%03d:/data_%d/regions", 
-	       basename, iorank, i);
-      for(int v=0 ; v<numvars ; ++v) {
-         snprintf(multivarObjs[v][i], 64, "%s.%03d:/data_%d/%s", 
-                  basename, iorank, i, vars[v]);
-      }
-    }
-  }
-
-  // Now write out the objects
-  ok += DBPutMultimesh(db, "mesh", numRanks,
-		       (char**)multimeshObjs, blockTypes, NULL);
-  ok += DBPutMultimat(db, "regions", numRanks,
-		      (char**)multimatObjs, NULL);
-  for(int v=0 ; v<numvars ; ++v) {
-     ok += DBPutMultivar(db, vars[v], numRanks,
-                         (char**)multivarObjs[v], varTypes, NULL);
-  }
-
-  for(int v=0; v < numvars; ++v) {
-    for(int i = 0; i < numRanks; i++) {
-      delete multivarObjs[v][i];
-    }
-    delete multivarObjs[v];
-  }
-
-  // Clean up
-  for(int i=0 ; i<numRanks ; i++) {
-    delete multimeshObjs[i];
-    delete multimatObjs[i];
-  }
-  delete [] multimeshObjs;
-  delete [] multimatObjs;
-  delete [] multivarObjs;
-  delete [] blockTypes;
-  delete [] varTypes;
-
-  if (ok != 0) {
-    printf("Error writing out multiXXX objs to viz file - rank 0\n");
-  }
-}
-
-# if USE_MPI
-
-/**********************************************************************/
-
-static void *
-LULESH_PMPIO_Create(const char *fname,
-		     const char *dname,
-		     void *udata)
-{
-   /* Create the file */
-   DBfile* db = DBCreate(fname, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X);
-
-   /* Put the data in a subdirectory, so VisIt only sees the multimesh
-    * objects we write out in the base file */
-   if (db) {
-     DBMkDir(db, dname);
-     DBSetDir(db, dname);
-   }
-   return (void*)db;
-}
-
-   
-/**********************************************************************/
-
-static void *
-LULESH_PMPIO_Open(const char *fname,
-		   const char *dname,
-		   PMPIO_iomode_t ioMode,
-		   void *udata)
-{
-   /* Open the file */
-  DBfile* db = DBOpen(fname, DB_UNKNOWN, DB_APPEND);
-
-   /* Put the data in a subdirectory, so VisIt only sees the multimesh
-    * objects we write out in the base file */
-   if (db) {
-     DBMkDir(db, dname);
-     DBSetDir(db, dname);
-   }
-   return (void*)db;
-}
-
-   
-/**********************************************************************/
-
-static void
-LULESH_PMPIO_Close(void *file, void *udata)
-{
-  DBfile *db = (DBfile*)file;
-  if (db)
-    DBClose(db);
-}
-# endif
-
-   
-#else
-
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks)
-{
-   if (myRank == 0) {
-      printf("Must enable -DVIZ_MESH at compile time to call DumpDomain\n");
-   }
-}
-
-#endif
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh.cc
deleted file mode 100644
index 1c5490ee6..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh.cc
+++ /dev/null
@@ -1,2639 +0,0 @@
-/*
-  This is a Version 2.0 MPI + OpenMP implementation of LULESH
-
-                 Copyright (c) 2010-2013.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 2.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-//////////////
-DIFFERENCES BETWEEN THIS VERSION (2.x) AND EARLIER VERSIONS:
-* Addition of regions to make work more representative of multi-material codes
-* Default size of each domain is 30^3 (27000 elem) instead of 45^3. This is
-  more representative of our actual working set sizes
-* Single source distribution supports pure serial, pure OpenMP, MPI-only, 
-  and MPI+OpenMP
-* Addition of ability to visualize the mesh using VisIt 
-  https://wci.llnl.gov/codes/visit/download.html
-* Various command line options (see ./lulesh2.0 -h)
- -q              : quiet mode - suppress stdout
- -i <iterations> : number of cycles to run
- -s <size>       : length of cube mesh along side
- -r <numregions> : Number of distinct regions (def: 11)
- -b <balance>    : Load balance between regions of a domain (def: 1)
- -c <cost>       : Extra cost of more expensive regions (def: 1)
- -f <filepieces> : Number of file parts for viz output (def: np/9)
- -p              : Print out progress
- -v              : Output viz file (requires compiling with -DVIZ_MESH
- -h              : This message
-
- printf("Usage: %s [opts]\n", execname);
-      printf(" where [opts] is one or more of:\n");
-      printf(" -q              : quiet mode - suppress all stdout\n");
-      printf(" -i <iterations> : number of cycles to run\n");
-      printf(" -s <size>       : length of cube mesh along side\n");
-      printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
-      printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
-      printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
-      printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
-      printf(" -p              : Print out progress\n");
-      printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
-      printf(" -h              : This message\n");
-      printf("\n\n");
-
-*Notable changes in LULESH 2.0
-
-* Split functionality into different files
-lulesh.cc - where most (all?) of the timed functionality lies
-lulesh-comm.cc - MPI functionality
-lulesh-init.cc - Setup code
-lulesh-viz.cc  - Support for visualization option
-lulesh-util.cc - Non-timed functions
-*
-* The concept of "regions" was added, although every region is the same ideal
-*    gas material, and the same sedov blast wave problem is still the only
-*    problem its hardcoded to solve.
-* Regions allow two things important to making this proxy app more representative:
-*   Four of the LULESH routines are now performed on a region-by-region basis,
-*     making the memory access patterns non-unit stride
-*   Artificial load imbalances can be easily introduced that could impact
-*     parallelization strategies.  
-* The load balance flag changes region assignment.  Region number is raised to
-*   the power entered for assignment probability.  Most likely regions changes
-*   with MPI process id.
-* The cost flag raises the cost of ~45% of the regions to evaluate EOS by the
-*   entered multiple. The cost of 5% is 10x the entered multiple.
-* MPI and OpenMP were added, and coalesced into a single version of the source
-*   that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
-* Added support to write plot files using "poor mans parallel I/O" when linked
-*   with the silo library, which in turn can be read by VisIt.
-* Enabled variable timestep calculation by default (courant condition), which
-*   results in an additional reduction.
-* Default domain (mesh) size reduced from 45^3 to 30^3
-* Command line options to allow numerous test cases without needing to recompile
-* Performance optimizations and code cleanup beyond LULESH 1.0
-* Added a "Figure of Merit" calculation (elements solved per microsecond) and
-*   output in support of using LULESH 2.0 for the 2017 CORAL procurement
-*
-* Possible Differences in Final Release (other changes possible)
-*
-* High Level mesh structure to allow data structure transformations
-* Different default parameters
-* Minor code performance changes and cleanup
-
-TODO in future versions
-* Add reader for (truly) unstructured meshes, probably serial only
-* CMake based build system
-
-//////////////
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <climits>
-#include <vector>
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <time.h>
-#include <sys/time.h>
-#include <iostream>
-#include <unistd.h>
-
-#include "lulesh.h"
-#include "Timer.hxx"
-
-#define RAJA_STORAGE static inline
-//#define RAJA_STORAGE 
-
-/* Manage temporary allocations with a pool */
-RAJA::MemoryPool< Real_t > elemMemPool ;
-
-/******************************************/
-
-/* Work Routines */
-
-RAJA_STORAGE
-void TimeIncrement(Domain& domain)
-{
-   Real_t targetdt = domain.stoptime() - domain.time() ;
-
-   if ((domain.dtfixed() <= Real_t(0.0)) && (domain.cycle() != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = domain.deltatime() ;
-
-      /* This will require a reduction in parallel */
-      Real_t gnewdt = Real_t(1.0e+20) ;
-      Real_t newdt ;
-      if (domain.dtcourant() < gnewdt) {
-         gnewdt = domain.dtcourant() / Real_t(2.0) ;
-      }
-      if (domain.dthydro() < gnewdt) {
-         gnewdt = domain.dthydro() * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-#if USE_MPI      
-      MPI_Allreduce(&gnewdt, &newdt, 1,
-                    ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE),
-                    MPI_MIN, MPI_COMM_WORLD) ;
-#else
-      newdt = gnewdt;
-#endif
-      
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < domain.deltatimemultlb()) {
-            newdt = olddt ;
-         }
-         else if (ratio > domain.deltatimemultub()) {
-            newdt = olddt*domain.deltatimemultub() ;
-         }
-      }
-
-      if (newdt > domain.dtmax()) {
-         newdt = domain.dtmax() ;
-      }
-      domain.deltatime() = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > domain.deltatime()) &&
-       (targetdt < (Real_t(4.0) * domain.deltatime() / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * domain.deltatime() / Real_t(3.0) ;
-   }
-
-   if (targetdt < domain.deltatime()) {
-      domain.deltatime() = targetdt ;
-   }
-
-   domain.time() += domain.deltatime() ;
-
-   ++domain.cycle() ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CollectDomainNodesToElemNodes(Domain* domain,
-                                   const Index_t* elemToNode,
-                                   Real_t elemX[8],
-                                   Real_t elemY[8],
-                                   Real_t elemZ[8])
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = domain->x(nd0i);
-   elemX[1] = domain->x(nd1i);
-   elemX[2] = domain->x(nd2i);
-   elemX[3] = domain->x(nd3i);
-   elemX[4] = domain->x(nd4i);
-   elemX[5] = domain->x(nd5i);
-   elemX[6] = domain->x(nd6i);
-   elemX[7] = domain->x(nd7i);
-
-   elemY[0] = domain->y(nd0i);
-   elemY[1] = domain->y(nd1i);
-   elemY[2] = domain->y(nd2i);
-   elemY[3] = domain->y(nd3i);
-   elemY[4] = domain->y(nd4i);
-   elemY[5] = domain->y(nd5i);
-   elemY[6] = domain->y(nd6i);
-   elemY[7] = domain->y(nd7i);
-
-   elemZ[0] = domain->z(nd0i);
-   elemZ[1] = domain->z(nd1i);
-   elemZ[2] = domain->z(nd2i);
-   elemZ[3] = domain->z(nd3i);
-   elemZ[4] = domain->z(nd4i);
-   elemZ[5] = domain->z(nd5i);
-   elemZ[6] = domain->z(nd6i);
-   elemZ[7] = domain->z(nd7i);
-
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void InitStressTermsForElems(Domain* domain,
-                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-
-   RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-      [=] RAJA_DEVICE (int i) {
-      sigxx[i] = sigyy[i] = sigzz[i] =  - domain->p(i) - domain->q(i) ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CalcElemShapeFunctionDerivatives( Real_t const x[],
-                                       Real_t const y[],
-                                       Real_t const z[],
-                                       Real_t b[][8],
-                                       Real_t* const volume )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_DEVICE
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CalcElemNodeNormals(Real_t pfx[8],
-                         Real_t pfy[8],
-                         Real_t pfz[8],
-                         const Real_t x[8],
-                         const Real_t y[8],
-                         const Real_t z[8])
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_DEVICE
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_t* fx, Real_t* fy, Real_t* fz )
-{
-   for(Index_t i = 0; i < 8; i++) {
-      fx[i] = -( stress_xx * B[0][i] );
-      fy[i] = -( stress_yy * B[1][i]  );
-      fz[i] = -( stress_zz * B[2][i] );
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void IntegrateStressForElems( Domain* domain,
-                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                              Real_t *determ, Index_t numElem)
-{
-#if defined(OMP_FINE_SYNC)
-  Real_t *fx_elem = elemMemPool.allocate(numElem*8) ;
-  Real_t *fy_elem = elemMemPool.allocate(numElem*8) ;
-  Real_t *fz_elem = elemMemPool.allocate(numElem*8) ;
-#endif
-
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-     [=] RAJA_DEVICE (int k) {
-    const Index_t* const elemToNode = domain->nodelist(k);
-    Real_t B[3][8] __attribute__((aligned(32))) ;// shape function derivatives
-    Real_t x_local[8] __attribute__((aligned(32))) ;
-    Real_t y_local[8] __attribute__((aligned(32))) ;
-    Real_t z_local[8] __attribute__((aligned(32))) ;
-#if !defined(OMP_FINE_SYNC)
-    Real_t fx_local[8] __attribute__((aligned(32))) ;
-    Real_t fy_local[8] __attribute__((aligned(32))) ;
-    Real_t fz_local[8] __attribute__((aligned(32))) ;
-#endif
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    CollectDomainNodesToElemNodes(domain, elemToNode,
-                                  x_local, y_local, z_local);
-
-    // Volume calculation involves extra work for numerical consistency
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                         B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                          x_local, y_local, z_local );
-
-    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-#if !defined(OMP_FINE_SYNC)
-                                 fx_local, fy_local, fz_local
-#else
-                                 &fx_elem[k*8], &fy_elem[k*8], &fz_elem[k*8]
-#endif
-                       ) ;
-
-#if !defined(OMP_FINE_SYNC)
-    // copy nodal force contributions to global force arrray.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode ) {
-       Index_t gnode = elemToNode[lnode];
-       domain->fx(gnode) += fx_local[lnode];
-       domain->fy(gnode) += fy_local[lnode];
-       domain->fz(gnode) += fz_local[lnode];
-    }
-#endif
-  } );
-
-#if defined(OMP_FINE_SYNC)
-  RAJA::forall<node_exec_policy>(domain->getNodeISet(),
-                                 [=] RAJA_DEVICE (int gnode) {
-     Index_t count = domain->nodeElemCount(gnode) ;
-     Index_t *cornerList = domain->nodeElemCornerList(gnode) ;
-     Real_t fx_sum = Real_t(0.0) ;
-     Real_t fy_sum = Real_t(0.0) ;
-     Real_t fz_sum = Real_t(0.0) ;
-     for (Index_t i=0 ; i < count ; ++i) {
-        Index_t ielem = cornerList[i] ;
-        fx_sum += fx_elem[ielem] ;
-        fy_sum += fy_elem[ielem] ;
-        fz_sum += fz_elem[ielem] ;
-     }
-     domain->fx(gnode) = fx_sum ;
-     domain->fy(gnode) = fy_sum ;
-     domain->fz(gnode) = fz_sum ;
-  } );
-
-  elemMemPool.release(&fz_elem) ;
-  elemMemPool.release(&fy_elem) ;
-  elemMemPool.release(&fx_elem) ;
-#endif
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_DEVICE
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CalcElemVolumeDerivative(Real_t dvdx[8],
-                              Real_t dvdy[8],
-                              Real_t dvdz[8],
-                              const Real_t x[8],
-                              const Real_t y[8],
-                              const Real_t z[8])
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t hourgam[][4],
-                              Real_t coefficient,
-                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
-{
-   Real_t hxx[4];
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * xd[0] + hourgam[1][i] * xd[1] +
-               hourgam[2][i] * xd[2] + hourgam[3][i] * xd[3] +
-               hourgam[4][i] * xd[4] + hourgam[5][i] * xd[5] +
-               hourgam[6][i] * xd[6] + hourgam[7][i] * xd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfx[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * yd[0] + hourgam[1][i] * yd[1] +
-               hourgam[2][i] * yd[2] + hourgam[3][i] * yd[3] +
-               hourgam[4][i] * yd[4] + hourgam[5][i] * yd[5] +
-               hourgam[6][i] * yd[6] + hourgam[7][i] * yd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfy[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * zd[0] + hourgam[1][i] * zd[1] +
-               hourgam[2][i] * zd[2] + hourgam[3][i] * zd[3] +
-               hourgam[4][i] * zd[4] + hourgam[5][i] * zd[5] +
-               hourgam[6][i] * zd[6] + hourgam[7][i] * zd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfz[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcFBHourglassForceForElems( Domain* domain,
-                                   Real_t *determ,
-                                   Real_t *x8n, Real_t *y8n, Real_t *z8n,
-                                   Real_t *dvdx, Real_t *dvdy, Real_t *dvdz,
-                                   Real_t hourg, Index_t numElem)
-{
-  /*************************************************
-   *
-   *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-   *               force.
-   *
-   *************************************************/
-  
-#if defined(OMP_FINE_SYNC)
-   Real_t *fx_elem = elemMemPool.allocate(numElem*8) ;
-   Real_t *fy_elem = elemMemPool.allocate(numElem*8) ;
-   Real_t *fz_elem = elemMemPool.allocate(numElem*8) ;
-#endif
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-
-   RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-      [=] RAJA_DEVICE (int i2) {
-
-#if !defined(OMP_FINE_SYNC)
-      Real_t hgfx[8], hgfy[8], hgfz[8] ;
-#endif
-
-      Real_t coefficient;
-
-      Real_t hourgam[8][4];
-      Real_t xd1[8], yd1[8], zd1[8] ;
-
-      // Define this here so code works on both host and device
-      const Real_t gamma[4][8] =
-      {
-        { Real_t( 1.), Real_t( 1.), Real_t(-1.), Real_t(-1.),
-          Real_t(-1.), Real_t(-1.), Real_t( 1.), Real_t( 1.) },
-
-        { Real_t( 1.), Real_t(-1.), Real_t(-1.), Real_t( 1.),
-          Real_t(-1.), Real_t( 1.), Real_t( 1.), Real_t(-1.) },
-
-        { Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.),
-          Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.) },
-
-        { Real_t(-1.), Real_t( 1.), Real_t(-1.), Real_t( 1.),
-          Real_t( 1.), Real_t(-1.), Real_t( 1.), Real_t(-1.) }
-      } ;
-
-      const Index_t *elemToNode = domain->nodelist(i2);
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
-            x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
-            x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
-            x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
-            y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
-            y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
-            y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
-            z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
-            z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
-            z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];
-
-         hourgam[0][i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam[1][i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam[2][i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam[3][i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam[4][i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam[5][i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam[6][i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam[7][i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=domain->ss(i2);
-      mass1=domain->elemMass(i2);
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = domain->xd(n0si2);
-      xd1[1] = domain->xd(n1si2);
-      xd1[2] = domain->xd(n2si2);
-      xd1[3] = domain->xd(n3si2);
-      xd1[4] = domain->xd(n4si2);
-      xd1[5] = domain->xd(n5si2);
-      xd1[6] = domain->xd(n6si2);
-      xd1[7] = domain->xd(n7si2);
-
-      yd1[0] = domain->yd(n0si2);
-      yd1[1] = domain->yd(n1si2);
-      yd1[2] = domain->yd(n2si2);
-      yd1[3] = domain->yd(n3si2);
-      yd1[4] = domain->yd(n4si2);
-      yd1[5] = domain->yd(n5si2);
-      yd1[6] = domain->yd(n6si2);
-      yd1[7] = domain->yd(n7si2);
-
-      zd1[0] = domain->zd(n0si2);
-      zd1[1] = domain->zd(n1si2);
-      zd1[2] = domain->zd(n2si2);
-      zd1[3] = domain->zd(n3si2);
-      zd1[4] = domain->zd(n4si2);
-      zd1[5] = domain->zd(n5si2);
-      zd1[6] = domain->zd(n6si2);
-      zd1[7] = domain->zd(n7si2);
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1, hourgam, coefficient,
-#if !defined(OMP_FINE_SYNC)
-                               hgfx, hgfy, hgfz
-#else
-                               &fx_elem[i3], &fy_elem[i3], &fz_elem[i3]
-#endif
-                              );
-
-#if !defined(OMP_FINE_SYNC)
-      domain->fx(n0si2) += hgfx[0];
-      domain->fy(n0si2) += hgfy[0];
-      domain->fz(n0si2) += hgfz[0];
-
-      domain->fx(n1si2) += hgfx[1];
-      domain->fy(n1si2) += hgfy[1];
-      domain->fz(n1si2) += hgfz[1];
-
-      domain->fx(n2si2) += hgfx[2];
-      domain->fy(n2si2) += hgfy[2];
-      domain->fz(n2si2) += hgfz[2];
-
-      domain->fx(n3si2) += hgfx[3];
-      domain->fy(n3si2) += hgfy[3];
-      domain->fz(n3si2) += hgfz[3];
-
-      domain->fx(n4si2) += hgfx[4];
-      domain->fy(n4si2) += hgfy[4];
-      domain->fz(n4si2) += hgfz[4];
-
-      domain->fx(n5si2) += hgfx[5];
-      domain->fy(n5si2) += hgfy[5];
-      domain->fz(n5si2) += hgfz[5];
-
-      domain->fx(n6si2) += hgfx[6];
-      domain->fy(n6si2) += hgfy[6];
-      domain->fz(n6si2) += hgfz[6];
-
-      domain->fx(n7si2) += hgfx[7];
-      domain->fy(n7si2) += hgfy[7];
-      domain->fz(n7si2) += hgfz[7];
-#endif
-   } );
-
-#if defined(OMP_FINE_SYNC)
-   // Collect the data from the local arrays into the final force arrays
-   RAJA::forall<node_exec_policy>(domain->getNodeISet(),
-                                  [=] RAJA_DEVICE (int gnode) {
-      Index_t count = domain->nodeElemCount(gnode) ;
-      Index_t *cornerList = domain->nodeElemCornerList(gnode) ;
-      Real_t fx_sum = Real_t(0.0) ;
-      Real_t fy_sum = Real_t(0.0) ;
-      Real_t fz_sum = Real_t(0.0) ;
-      for (Index_t i=0 ; i < count ; ++i) {
-         Index_t ielem = cornerList[i] ;
-         fx_sum += fx_elem[ielem] ;
-         fy_sum += fy_elem[ielem] ;
-         fz_sum += fz_elem[ielem] ;
-      }
-      domain->fx(gnode) += fx_sum ;
-      domain->fy(gnode) += fy_sum ;
-      domain->fz(gnode) += fz_sum ;
-   } );
-
-   elemMemPool.release(&fz_elem) ;
-   elemMemPool.release(&fy_elem) ;
-   elemMemPool.release(&fx_elem) ;
-#endif
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcHourglassControlForElems(Domain* domain,
-                                  Real_t determ[], Real_t hgcoef)
-{
-   Index_t numElem = domain->numElem() ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *dvdx = elemMemPool.allocate(numElem8) ;
-   Real_t *dvdy = elemMemPool.allocate(numElem8) ;
-   Real_t *dvdz = elemMemPool.allocate(numElem8) ;
-   Real_t *x8n  = elemMemPool.allocate(numElem8) ;
-   Real_t *y8n  = elemMemPool.allocate(numElem8) ;
-   Real_t *z8n  = elemMemPool.allocate(numElem8) ;
-
-   // For negative element volume check
-   RAJA::ReduceMin<reduce_policy, Real_t> minvol(Real_t(1.0e+20));
-
-   /* start loop over elements */
-   RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-        [=] RAJA_DEVICE (int i) {
-#if 1
-      /* This variant makes overall runtime 2% faster on CPU */
-      Real_t  x1[8],  y1[8],  z1[8] ;
-      Real_t pfx[8], pfy[8], pfz[8] ;
-
-      Index_t* elemToNode = domain->nodelist(i);
-      CollectDomainNodesToElemNodes(domain, elemToNode, x1, y1, z1);
-
-      CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
-
-      /* load into temporary storage for FB Hour Glass control */
-      for(Index_t ii=0;ii<8;++ii) {
-         Index_t jj=8*i+ii;
-
-         dvdx[jj] = pfx[ii];
-         dvdy[jj] = pfy[ii];
-         dvdz[jj] = pfz[ii];
-         x8n[jj]  = x1[ii];
-         y8n[jj]  = y1[ii];
-         z8n[jj]  = z1[ii];
-      }
-#else
-      /* This variant is likely GPU friendly */
-      Index_t* elemToNode = domain->nodelist(i);
-      CollectDomainNodesToElemNodes(domain, elemToNode,
-                                    &x8n[8*i], &y8n[8*i], &z8n[8*i]);
-
-      CalcElemVolumeDerivative(&dvdx[8*i], &dvdy[8*i], &dvdz[8*i],
-                               &x8n[8*i], &y8n[8*i], &z8n[8*i]);
-#endif
-
-      determ[i] = domain->volo(i) * domain->v(i);
-
-      minvol.min(domain->v(i));
-
-   } );
-
-   /* Do a check for negative volumes */
-   if ( Real_t(minvol) <= Real_t(0.0) ) {
-#if USE_MPI         
-      MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-      exit(VolumeError);
-#endif
-   }
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems( domain,
-                                    determ, x8n, y8n, z8n, dvdx, dvdy, dvdz,
-                                    hgcoef, numElem ) ;
-   }
-
-   elemMemPool.release(&z8n) ;
-   elemMemPool.release(&y8n) ;
-   elemMemPool.release(&x8n) ;
-   elemMemPool.release(&dvdz) ;
-   elemMemPool.release(&dvdy) ;
-   elemMemPool.release(&dvdx) ;
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcVolumeForceForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-   if (numElem != 0) {
-      Real_t  hgcoef = domain->hgcoef() ;
-      Real_t *sigxx  = elemMemPool.allocate(numElem) ;
-      Real_t *sigyy  = elemMemPool.allocate(numElem) ;
-      Real_t *sigzz  = elemMemPool.allocate(numElem) ;
-      Real_t *determ = elemMemPool.allocate(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(domain, sigxx, sigyy, sigzz);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( domain,
-                               sigxx, sigyy, sigzz, determ, numElem );
-
-      // check for negative element volume
-      RAJA::ReduceMin<reduce_policy, Real_t> minvol(Real_t(1.0e+20));
-      RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-           [=] RAJA_DEVICE (int k) {
-         minvol.min(determ[k]);
-      } );
-
-      if (Real_t(minvol) <= Real_t(0.0)) {
-#if USE_MPI            
-         MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-         exit(VolumeError);
-#endif
-      }
-
-      CalcHourglassControlForElems(domain, determ, hgcoef) ;
-
-      elemMemPool.release(&determ) ;
-      elemMemPool.release(&sigzz) ;
-      elemMemPool.release(&sigyy) ;
-      elemMemPool.release(&sigxx) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE void CalcForceForNodes(Domain* domain)
-{
-#if USE_MPI  
-  CommRecv(*domain, MSG_COMM_SBN, 3,
-           domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-           true, false) ;
-#endif  
-
-  RAJA::forall<node_exec_policy>(domain->getNodeISet(),
-       [=] RAJA_DEVICE (int i) {
-     domain->fx(i) = Real_t(0.0) ;
-     domain->fy(i) = Real_t(0.0) ;
-     domain->fz(i) = Real_t(0.0) ;
-  } );
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems(domain) ;
-
-#if USE_MPI  
-  Domain_member fieldData[3] ;
-  fieldData[0] = &Domain::fx ;
-  fieldData[1] = &Domain::fy ;
-  fieldData[2] = &Domain::fz ;
-  
-  CommSend(*domain, MSG_COMM_SBN, 3, fieldData,
-           domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() +  1,
-           true, false) ;
-  CommSBN(*domain, 3, fieldData) ;
-#endif  
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcAccelerationForNodes(Domain* domain)
-{
-   
-   RAJA::forall<node_exec_policy>(domain->getNodeISet(),
-        [=] RAJA_DEVICE (int i) {
-      domain->xdd(i) = domain->fx(i) / domain->nodalMass(i);
-      domain->ydd(i) = domain->fy(i) / domain->nodalMass(i);
-      domain->zdd(i) = domain->fz(i) / domain->nodalMass(i);
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void ApplyAccelerationBoundaryConditionsForNodes(Domain* domain)
-{
-   RAJA::forall<symnode_exec_policy>(domain->getXSymNodeISet(),
-        [=] RAJA_DEVICE (int i) {
-      domain->xdd(i) = Real_t(0.0) ;
-   } );
-
-   RAJA::forall<symnode_exec_policy>(domain->getYSymNodeISet(),
-        [=] RAJA_DEVICE (int i) {
-      domain->ydd(i) = Real_t(0.0) ;
-   } );
-
-   RAJA::forall<symnode_exec_policy>(domain->getZSymNodeISet(),
-        [=] RAJA_DEVICE (int i) {
-      domain->zdd(i) = Real_t(0.0) ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcVelocityForNodes(Domain* domain, const Real_t dt, const Real_t u_cut)
-{
-
-   RAJA::forall<node_exec_policy>(domain->getNodeISet(),
-       [=] RAJA_DEVICE (int i) {
-     Real_t xdtmp, ydtmp, zdtmp ;
-
-     xdtmp = domain->xd(i) + domain->xdd(i) * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     domain->xd(i) = xdtmp ;
-
-     ydtmp = domain->yd(i) + domain->ydd(i) * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     domain->yd(i) = ydtmp ;
-
-     zdtmp = domain->zd(i) + domain->zdd(i) * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     domain->zd(i) = zdtmp ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcPositionForNodes(Domain* domain, const Real_t dt)
-{
-   RAJA::forall<node_exec_policy>(domain->getNodeISet(),
-       [=] RAJA_DEVICE (int i) {
-     domain->x(i) += domain->xd(i) * dt ;
-     domain->y(i) += domain->yd(i) * dt ;
-     domain->z(i) += domain->zd(i) * dt ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeNodal(Domain* domain)
-{
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-   Domain_member fieldData[6] ;
-#endif
-
-   const Real_t delt = domain->deltatime() ;
-   Real_t u_cut = domain->u_cut() ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes(domain);
-
-#if USE_MPI  
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-   CommRecv(*domain, MSG_SYNC_POS_VEL, 6,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-#endif
-#endif
-   
-   CalcAccelerationForNodes(domain);
-   
-   ApplyAccelerationBoundaryConditionsForNodes(domain);
-
-   CalcVelocityForNodes( domain, delt, u_cut) ;
-
-   CalcPositionForNodes( domain, delt );
-#if USE_MPI
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-  fieldData[0] = &Domain::x ;
-  fieldData[1] = &Domain::y ;
-  fieldData[2] = &Domain::z ;
-  fieldData[3] = &Domain::xd ;
-  fieldData[4] = &Domain::yd ;
-  fieldData[5] = &Domain::zd ;
-
-   CommSend(*domain, MSG_SYNC_POS_VEL, 6, fieldData,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-   CommSyncPosVel(*domain) ;
-#endif
-#endif
-   
-  return;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_HOST_DEVICE
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-/******************************************/
-
-//inline
-RAJA_HOST_DEVICE
-Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_DEVICE
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_DEVICE
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = MAX(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = MAX(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-RAJA_DEVICE
-void CalcElemVelocityGradient( const Real_t* const xvel,
-                                const Real_t* const yvel,
-                                const Real_t* const zvel,
-                                const Real_t b[][8],
-                                const Real_t detJ,
-                                Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-/******************************************/
-
-//RAJA_STORAGE
-void CalcKinematicsForElems( Domain* domain,
-                             Real_t deltaTime, Index_t numElem )
-{
-
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-      [=] RAJA_DEVICE (int k) { 
-    Real_t B[3][8] ; /** shape function derivatives */
-    Real_t D[6] ;
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-    Real_t xd_local[8] ;
-    Real_t yd_local[8] ;
-    Real_t zd_local[8] ;
-    Real_t detJ = Real_t(0.0) ;
-
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_t* const elemToNode = domain->nodelist(k) ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / domain->volo(k) ;
-    domain->vnew(k) = relativeVolume ;
-    domain->delv(k) = relativeVolume - domain->v(k) ;
-
-    // set characteristic length
-    domain->arealg(k) = CalcElemCharacteristicLength(x_local, y_local, z_local,
-                                             volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = domain->xd(gnode);
-      yd_local[lnode] = domain->yd(gnode);
-      zd_local[lnode] = domain->zd(gnode); 
-    }
-
-    Real_t dt2 = Real_t(0.5) * deltaTime;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local, y_local, z_local,
-                                      B, &detJ );
-
-    CalcElemVelocityGradient( xd_local, yd_local, zd_local,
-                               B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    domain->dxx(k) = D[0];
-    domain->dyy(k) = D[1];
-    domain->dzz(k) = D[2];
-  } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcLagrangeElements(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-   if (numElem > 0) {
-      const Real_t deltatime = domain->deltatime() ;
-
-      domain->AllocateStrains(elemMemPool, numElem);
-
-      CalcKinematicsForElems(domain, deltatime, numElem) ;
-
-      // check for negative element volume
-      RAJA::ReduceMin<reduce_policy, Real_t> minvol(Real_t(1.0e+20));
-
-      // element loop to do some stuff not included in the elemlib function.
-      RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-           [=] RAJA_DEVICE (int k) {
-         // calc strain rate and apply as constraint (only done in FB element)
-         Real_t vdov = domain->dxx(k) + domain->dyy(k) + domain->dzz(k) ;
-         Real_t vdovthird = vdov/Real_t(3.0) ;
-
-         // make the rate of deformation tensor deviatoric
-         domain->vdov(k) = vdov ;
-         domain->dxx(k) -= vdovthird ;
-         domain->dyy(k) -= vdovthird ;
-         domain->dzz(k) -= vdovthird ;
-
-         minvol.min(domain->vnew(k));
-      } );
-
-      if (Real_t(minvol) <= Real_t(0.0)) {
-#if USE_MPI           
-         MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-         exit(VolumeError);
-#endif
-      }
-
-      domain->DeallocateStrains(elemMemPool);
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQGradientsForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem();
-
-   RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-        [=] RAJA_DEVICE (int i) {
-      const Real_t ptiny = Real_t(1.e-36) ;
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      const Index_t *elemToNode = domain->nodelist(i);
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = domain->x(n0) ;
-      Real_t x1 = domain->x(n1) ;
-      Real_t x2 = domain->x(n2) ;
-      Real_t x3 = domain->x(n3) ;
-      Real_t x4 = domain->x(n4) ;
-      Real_t x5 = domain->x(n5) ;
-      Real_t x6 = domain->x(n6) ;
-      Real_t x7 = domain->x(n7) ;
-
-      Real_t y0 = domain->y(n0) ;
-      Real_t y1 = domain->y(n1) ;
-      Real_t y2 = domain->y(n2) ;
-      Real_t y3 = domain->y(n3) ;
-      Real_t y4 = domain->y(n4) ;
-      Real_t y5 = domain->y(n5) ;
-      Real_t y6 = domain->y(n6) ;
-      Real_t y7 = domain->y(n7) ;
-
-      Real_t z0 = domain->z(n0) ;
-      Real_t z1 = domain->z(n1) ;
-      Real_t z2 = domain->z(n2) ;
-      Real_t z3 = domain->z(n3) ;
-      Real_t z4 = domain->z(n4) ;
-      Real_t z5 = domain->z(n5) ;
-      Real_t z6 = domain->z(n6) ;
-      Real_t z7 = domain->z(n7) ;
-
-      Real_t xv0 = domain->xd(n0) ;
-      Real_t xv1 = domain->xd(n1) ;
-      Real_t xv2 = domain->xd(n2) ;
-      Real_t xv3 = domain->xd(n3) ;
-      Real_t xv4 = domain->xd(n4) ;
-      Real_t xv5 = domain->xd(n5) ;
-      Real_t xv6 = domain->xd(n6) ;
-      Real_t xv7 = domain->xd(n7) ;
-
-      Real_t yv0 = domain->yd(n0) ;
-      Real_t yv1 = domain->yd(n1) ;
-      Real_t yv2 = domain->yd(n2) ;
-      Real_t yv3 = domain->yd(n3) ;
-      Real_t yv4 = domain->yd(n4) ;
-      Real_t yv5 = domain->yd(n5) ;
-      Real_t yv6 = domain->yd(n6) ;
-      Real_t yv7 = domain->yd(n7) ;
-
-      Real_t zv0 = domain->zd(n0) ;
-      Real_t zv1 = domain->zd(n1) ;
-      Real_t zv2 = domain->zd(n2) ;
-      Real_t zv3 = domain->zd(n3) ;
-      Real_t zv4 = domain->zd(n4) ;
-      Real_t zv5 = domain->zd(n5) ;
-      Real_t zv6 = domain->zd(n6) ;
-      Real_t zv7 = domain->zd(n7) ;
-
-      Real_t vol = domain->volo(i)*domain->vnew(i) ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*((x0+x1+x5+x4) - (x3+x2+x6+x7)) ;
-      Real_t dyj = Real_t(-0.25)*((y0+y1+y5+y4) - (y3+y2+y6+y7)) ;
-      Real_t dzj = Real_t(-0.25)*((z0+z1+z5+z4) - (z3+z2+z6+z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*((x1+x2+x6+x5) - (x0+x3+x7+x4)) ;
-      Real_t dyi = Real_t( 0.25)*((y1+y2+y6+y5) - (y0+y3+y7+y4)) ;
-      Real_t dzi = Real_t( 0.25)*((z1+z2+z6+z5) - (z0+z3+z7+z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*((x4+x5+x6+x7) - (x0+x1+x2+x3)) ;
-      Real_t dyk = Real_t( 0.25)*((y4+y5+y6+y7) - (y0+y1+y2+y3)) ;
-      Real_t dzk = Real_t( 0.25)*((z4+z5+z6+z7) - (z0+z1+z2+z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      domain->delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*((xv4+xv5+xv6+xv7) - (xv0+xv1+xv2+xv3)) ;
-      dyv = Real_t(0.25)*((yv4+yv5+yv6+yv7) - (yv0+yv1+yv2+yv3)) ;
-      dzv = Real_t(0.25)*((zv4+zv5+zv6+zv7) - (zv0+zv1+zv2+zv3)) ;
-
-      domain->delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      domain->delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*((xv1+xv2+xv6+xv5) - (xv0+xv3+xv7+xv4)) ;
-      dyv = Real_t(0.25)*((yv1+yv2+yv6+yv5) - (yv0+yv3+yv7+yv4)) ;
-      dzv = Real_t(0.25)*((zv1+zv2+zv6+zv5) - (zv0+zv3+zv7+zv4)) ;
-
-      domain->delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      domain->delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*((xv0+xv1+xv5+xv4) - (xv3+xv2+xv6+xv7)) ;
-      dyv = Real_t(-0.25)*((yv0+yv1+yv5+yv4) - (yv3+yv2+yv6+yv7)) ;
-      dzv = Real_t(-0.25)*((zv0+zv1+zv5+zv4) - (zv3+zv2+zv6+zv7)) ;
-
-      domain->delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQRegionForElems(Domain* domain, Int_t r,
-                                  Real_t ptiny)
-{
-   Real_t monoq_limiter_mult = domain->monoq_limiter_mult();
-   Real_t monoq_max_slope = domain->monoq_max_slope();
-   Real_t qlc_monoq = domain->qlc_monoq();
-   Real_t qqc_monoq = domain->qqc_monoq();
-
-   RAJA::forall<mat_exec_policy>(domain->getRegionISet(r),
-        [=] RAJA_DEVICE (int ielem) {
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Int_t bcMask = domain->elemBC(ielem) ;
-      Real_t delvm = 0.0, delvp =0.0;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / (domain->delv_xi(ielem)+ ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case XI_M_COMM: /* needs comm data */
-         case 0:         delvm = domain->delv_xi(domain->lxim(ielem)); break ;
-         case XI_M_SYMM: delvm = domain->delv_xi(ielem) ;       break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
-         default:        /* fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__); */
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & XI_P) {
-         case XI_P_COMM: /* needs comm data */
-         case 0:         delvp = domain->delv_xi(domain->lxip(ielem)) ; break ;
-         case XI_P_SYMM: delvp = domain->delv_xi(ielem) ;       break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
-         default:        /* fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__); */
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( domain->delv_eta(ielem) + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case ETA_M_COMM: /* needs comm data */
-         case 0:          delvm = domain->delv_eta(domain->letam(ielem)) ; break ;
-         case ETA_M_SYMM: delvm = domain->delv_eta(ielem) ;        break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
-         default:         /* fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__); */
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & ETA_P) {
-         case ETA_P_COMM: /* needs comm data */
-         case 0:          delvp = domain->delv_eta(domain->letap(ielem)) ; break ;
-         case ETA_P_SYMM: delvp = domain->delv_eta(ielem) ;        break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
-         default:         /* fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__); */
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( domain->delv_zeta(ielem) + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case ZETA_M_COMM: /* needs comm data */
-         case 0:           delvm = domain->delv_zeta(domain->lzetam(ielem)) ; break ;
-         case ZETA_M_SYMM: delvm = domain->delv_zeta(ielem) ;         break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
-         default:          /* fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__); */
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & ZETA_P) {
-         case ZETA_P_COMM: /* needs comm data */
-         case 0:           delvp = domain->delv_zeta(domain->lzetap(ielem)) ; break ;
-         case ZETA_P_SYMM: delvp = domain->delv_zeta(ielem) ;         break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
-         default:          /* fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__); */
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( domain->vdov(ielem) > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = domain->delv_xi(ielem)   * domain->delx_xi(ielem)   ;
-         Real_t delvxeta  = domain->delv_eta(ielem)  * domain->delx_eta(ielem)  ;
-         Real_t delvxzeta = domain->delv_zeta(ielem) * domain->delx_zeta(ielem) ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = domain->elemMass(ielem) / (domain->volo(ielem) * domain->vnew(ielem)) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      domain->qq(ielem) = qquad ;
-      domain->ql(ielem) = qlin  ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQForElems(Domain* domain)
-{  
-   //
-   // initialize parameters
-   // 
-   const Real_t ptiny = Real_t(1.e-36) ;
-
-   //
-   // calculate the monotonic q for all regions
-   //
-   for (Index_t r=0 ; r<domain->numReg() ; ++r) {
-      if (domain->regElemSize(r) > 0) {
-         CalcMonotonicQRegionForElems(domain, r, ptiny) ;
-      }
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcQForElems(Domain* domain)
-{
-   //
-   // MONOTONIC Q option
-   //
-
-   Index_t numElem = domain->numElem() ;
-
-   if (numElem != 0) {
-      Int_t allElem = numElem +  /* local elem */
-            2*domain->sizeX()*domain->sizeY() + /* plane ghosts */
-            2*domain->sizeX()*domain->sizeZ() + /* row ghosts */
-            2*domain->sizeY()*domain->sizeZ() ; /* col ghosts */
-
-      domain->AllocateGradients(elemMemPool, numElem, allElem);
-
-#if USE_MPI
-      CommRecv(*domain, MSG_MONOQ, 3,
-               domain->sizeX(), domain->sizeY(), domain->sizeZ(),
-               true, true) ;
-#endif      
-
-      /* Calculate velocity gradients */
-      CalcMonotonicQGradientsForElems(domain);
-
-#if USE_MPI      
-      Domain_member fieldData[3] ;
-      
-      /* Transfer veloctiy gradients in the first order elements */
-      /* problem->commElements->Transfer(CommElements::monoQ) ; */
-
-      fieldData[0] = &Domain::delv_xi ;
-      fieldData[1] = &Domain::delv_eta ;
-      fieldData[2] = &Domain::delv_zeta ;
-
-      CommSend(*domain, MSG_MONOQ, 3, fieldData,
-               domain->sizeX(), domain->sizeY(), domain->sizeZ(),
-               true, true) ;
-
-      CommMonoQ(*domain) ;
-#endif      
-
-      CalcMonotonicQForElems(domain) ;
-
-      // Free up memory
-      domain->DeallocateGradients(elemMemPool);
-
-      /* Don't allow excessive artificial viscosity */
-      RAJA::ReduceMax<reduce_policy, Real_t>
-             maxQ(domain->qstop() - Real_t(1.0)) ;
-      RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-           [=] RAJA_DEVICE (int ielem) {
-         maxQ.max(domain->q(ielem)) ;
-      } ) ;
-
-      if ( Real_t(maxQ) > domain->qstop() ) {
-#if USE_MPI         
-         MPI_Abort(MPI_COMM_WORLD, QStopError) ;
-#else
-         exit(QStopError);
-#endif
-      }
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcPressureForElems(Real_t* p_new, Real_t* bvc,
-                          Real_t* pbvc, Real_t* e_old,
-                          Real_t* compression, Real_t *vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          LULESH_ISET& regISet)
-{
-   RAJA::forall<mat_exec_policy>(regISet,
-        [=] RAJA_DEVICE (int ielem) {
-      Real_t const  c1s = Real_t(2.0)/Real_t(3.0) ;
-      bvc[ielem] = c1s * (compression[ielem] + Real_t(1.));
-      pbvc[ielem] = c1s;
-   } );
-
-   RAJA::forall<mat_exec_policy>(regISet,
-        [=] RAJA_DEVICE (int ielem) {
-      p_new[ielem] = bvc[ielem] * e_old[ielem] ;
-
-      if    (FABS(p_new[ielem]) <  p_cut   )
-         p_new[ielem] = Real_t(0.0) ;
-
-      if    ( vnewc[ielem] >= eosvmax ) /* impossible condition here? */
-         p_new[ielem] = Real_t(0.0) ;
-
-      if    (p_new[ielem]       <  pmin)
-         p_new[ielem]   = pmin ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcEnergyForElems(Domain* domain,
-                        Real_t* p_new, Real_t* e_new, Real_t* q_new,
-                        Real_t* bvc, Real_t* pbvc,
-                        Real_t* p_old,
-                        Real_t* compression, Real_t* compHalfStep,
-                        Real_t* vnewc, Real_t* work, Real_t *pHalfStep,
-                        Real_t pmin, Real_t p_cut, Real_t  e_cut,
-                        Real_t q_cut, Real_t emin,
-                        Real_t rho0,
-                        Real_t eosvmax,
-                        LULESH_ISET& regISet)
-{
-   RAJA::forall<mat_exec_policy>(regISet,
-        [=] RAJA_DEVICE (int ielem) {  
-      e_new[ielem] = domain->e(ielem)
-         - Real_t(0.5) * domain->delv(ielem) * (p_old[ielem] + domain->q(ielem))
-         + Real_t(0.5) * work[ielem];
-
-      if (e_new[ielem]  < emin ) {
-         e_new[ielem] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                        pmin, p_cut, eosvmax, 
-                        regISet);
-
-   RAJA::forall<mat_exec_policy>(regISet,
-        [=] RAJA_DEVICE (int ielem) {  
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[ielem]) ;
-
-      if ( domain->delv(ielem) > Real_t(0.) ) {
-         q_new[ielem] /* = domain->qq(ielem) = domain->ql(ielem) */ = Real_t(0.);
-      }
-      else {
-         Real_t ssc = ( pbvc[ielem] * e_new[ielem]
-                 + vhalf * vhalf * bvc[ielem] * pHalfStep[ielem] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[ielem] = (ssc*domain->ql(ielem) + domain->qq(ielem)) ;
-      }
-
-      e_new[ielem] = e_new[ielem] + Real_t(0.5) * domain->delv(ielem)
-         * (  Real_t(3.0)*(p_old[ielem]     + domain->q(ielem))
-              - Real_t(4.0)*(pHalfStep[ielem] + q_new[ielem])) ;
-   } );
-
-   RAJA::forall<mat_exec_policy>(regISet,
-        [=] RAJA_DEVICE (int ielem) {  
-      e_new[ielem] += Real_t(0.5) * work[ielem];
-
-      if (FABS(e_new[ielem]) < e_cut) {
-         e_new[ielem] = Real_t(0.)  ;
-      }
-      if (     e_new[ielem]  < emin ) {
-         e_new[ielem] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                        pmin, p_cut, eosvmax, 
-                        regISet);
-
-   RAJA::forall<mat_exec_policy>(regISet,
-        [=] RAJA_DEVICE (int ielem) {  
-      const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-      Real_t q_tilde ;
-
-      if (domain->delv(ielem) > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[ielem] * e_new[ielem]
-                 + vnewc[ielem] * vnewc[ielem] * bvc[ielem] * p_new[ielem] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*domain->ql(ielem) + domain->qq(ielem)) ;
-      }
-
-      e_new[ielem] -= (  Real_t(7.0)*(p_old[ielem]     + domain->q(ielem))
-                       - Real_t(8.0)*(pHalfStep[ielem] + q_new[ielem])
-                       + (p_new[ielem] + q_tilde)) * domain->delv(ielem)*sixth ;
-
-      if (FABS(e_new[ielem]) < e_cut) {
-         e_new[ielem] = Real_t(0.)  ;
-      }
-      if (     e_new[ielem]  < emin ) {
-         e_new[ielem] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                        pmin, p_cut, eosvmax, 
-                        regISet);
-
-   RAJA::forall<mat_exec_policy>(regISet,
-        [=] RAJA_DEVICE (int ielem) {
-      if ( domain->delv(ielem) <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[ielem] * e_new[ielem]
-            + vnewc[ielem] * vnewc[ielem] * bvc[ielem] * p_new[ielem] ) / rho0;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[ielem] = (ssc*domain->ql(ielem) + domain->qq(ielem)) ;
-
-         if (FABS(q_new[ielem]) < q_cut) q_new[ielem] = Real_t(0.) ;
-      }
-   } );
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcSoundSpeedForElems(Domain* domain,
-                            Real_t *vnewc, Real_t rho0, Real_t *enewc,
-                            Real_t *pnewc, Real_t *pbvc,
-                            Real_t *bvc, Real_t ss4o3,
-                            LULESH_ISET& regISet)
-{
-   RAJA::forall<mat_exec_policy>(regISet,
-        [=] RAJA_DEVICE (int ielem) {
-      Real_t ssTmp = (pbvc[ielem] * enewc[ielem] + vnewc[ielem] * vnewc[ielem] *
-                 bvc[ielem] * pnewc[ielem]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp);
-      }
-      domain->ss(ielem) = ssTmp ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void EvalEOSForElems(Domain* domain,
-                     Real_t *vnewc, Real_t *p_old,
-                     Real_t *compression, Real_t *compHalfStep,
-                     Real_t *work, Real_t *p_new, Real_t *e_new,
-                     Real_t *q_new, Real_t *bvc, Real_t *pbvc,
-                     Real_t *pHalfStep, Int_t reg_num, Int_t rep)
-{
-   Real_t  e_cut = domain->e_cut() ;
-   Real_t  p_cut = domain->p_cut() ;
-   Real_t  ss4o3 = domain->ss4o3() ;
-   Real_t  q_cut = domain->q_cut() ;
-
-   Real_t eosvmax = domain->eosvmax() ;
-   Real_t eosvmin = domain->eosvmin() ;
-   Real_t pmin    = domain->pmin() ;
-   Real_t emin    = domain->emin() ;
-   Real_t rho0    = domain->refdens() ;
-
-   LULESH_ISET& regISet = domain->getRegionISet(reg_num);
-   Int_t numElemReg = regISet.getLength();
- 
-   //loop to add load imbalance based on region number 
-   for(Int_t j = 0; j < rep; j++) {
-      /* compress data, minimal set */
-      RAJA::forall<mat_exec_policy>(regISet,
-           [=] RAJA_DEVICE (Index_t ielem) {
-         p_old[ielem] = domain->p(ielem) ;
-         work[ielem] = Real_t(0.0) ;
-      } );
-
-      RAJA::forall<mat_exec_policy>(regISet,
-           [=] RAJA_DEVICE (Index_t ielem) {
-         Real_t vchalf ;
-         compression[ielem] = Real_t(1.) / vnewc[ielem] - Real_t(1.);
-         vchalf = vnewc[ielem] - domain->delv(ielem) * Real_t(.5);
-         compHalfStep[ielem] = Real_t(1.) / vchalf - Real_t(1.);
-      } );
-
-      /* Check for v > eosvmax or v < eosvmin */
-      if ( eosvmin != Real_t(0.) ) {
-         RAJA::forall<mat_exec_policy>(regISet,
-              [=] RAJA_DEVICE (Index_t ielem) {
-            if (vnewc[ielem] <= eosvmin) { /* impossible due to calling func? */
-               compHalfStep[ielem] = compression[ielem] ;
-            }
-         } );
-      }
-
-      if ( eosvmax != Real_t(0.) ) {
-         RAJA::forall<mat_exec_policy>(regISet,
-              [=] RAJA_DEVICE (Index_t ielem) {
-            if (vnewc[ielem] >= eosvmax) { /* impossible due to calling func? */
-               p_old[ielem]        = Real_t(0.) ;
-               compression[ielem]  = Real_t(0.) ;
-               compHalfStep[ielem] = Real_t(0.) ;
-            }
-         } );
-      }
-
-      CalcEnergyForElems(domain, p_new, e_new, q_new, bvc, pbvc,
-                         p_old, compression, compHalfStep,
-                         vnewc, work, pHalfStep, pmin,
-                         p_cut, e_cut, q_cut, emin,
-                         rho0, eosvmax,
-                         regISet);
-   }
-
-   RAJA::forall<mat_exec_policy>(regISet,
-        [=] RAJA_DEVICE (Index_t ielem) {
-      domain->p(ielem) = p_new[ielem] ;
-      domain->e(ielem) = e_new[ielem] ;
-      domain->q(ielem) = q_new[ielem] ;
-   } );
-
-   CalcSoundSpeedForElems(domain,
-                          vnewc, rho0, e_new, p_new,
-                          pbvc, bvc, ss4o3,
-                          regISet) ;
-
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void ApplyMaterialPropertiesForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-
-  if (numElem != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = domain->eosvmin() ;
-    Real_t eosvmax = domain->eosvmax() ;
-    Real_t *vnewc = elemMemPool.allocate(numElem) ;
-    Real_t *p_old = elemMemPool.allocate(numElem) ;
-    Real_t *compression = elemMemPool.allocate(numElem) ;
-    Real_t *compHalfStep = elemMemPool.allocate(numElem) ;
-    Real_t *work = elemMemPool.allocate(numElem) ;
-    Real_t *p_new = elemMemPool.allocate(numElem) ;
-    Real_t *e_new = elemMemPool.allocate(numElem) ;
-    Real_t *q_new = elemMemPool.allocate(numElem) ;
-    Real_t *bvc = elemMemPool.allocate(numElem) ;
-    Real_t *pbvc = elemMemPool.allocate(numElem) ;
-    Real_t *pHalfStep = elemMemPool.allocate(numElem) ;
-
-
-    RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-         [=] RAJA_DEVICE (int i) {
-       vnewc[i] = domain->vnew(i) ;
-    } );
-
-    // Bound the updated relative volumes with eosvmin/max
-    if (eosvmin != Real_t(0.)) {
-       RAJA::forall<elem_exec_policy>(domain->getElemISet(), 
-            [=] RAJA_DEVICE (int i) {
-          if (vnewc[i] < eosvmin)
-             vnewc[i] = eosvmin ;
-       } );
-    }
-
-    if (eosvmax != Real_t(0.)) {
-       RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-            [=] RAJA_DEVICE (int i) {
-          if (vnewc[i] > eosvmax)
-             vnewc[i] = eosvmax ;
-       } );
-    }
-
-    // check for negative element volume
-    RAJA::ReduceMin<reduce_policy, Real_t> minvol(Real_t(1.0e+20));
-
-    // This check may not make perfect sense in LULESH, but
-    // it's representative of something in the full code -
-    // just leave it in, please
-    RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-         [=] RAJA_DEVICE (int i) {
-       Real_t vc = domain->v(i) ;
-       if (eosvmin != Real_t(0.)) {
-          if (vc < eosvmin)
-             vc = -1.0 ;
-       }
-       if (eosvmax != Real_t(0.)) {
-          if (vc > eosvmax)
-             vc = -1.0 ;
-       }
-
-       minvol.min(vc);
-    } );
-
-    if (Real_t(minvol) <= 0.) {
-#if USE_MPI             
-       MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-       exit(VolumeError);
-#endif
-    }
-
-    for (Int_t reg_num=0 ; reg_num < domain->numReg() ; reg_num++) {
-       Int_t rep;
-       //Determine load imbalance for this region
-       //round down the number with lowest cost
-       if(reg_num < domain->numReg()/2)
-	 rep = 1;
-       //you don't get an expensive region unless you at least have 5 regions
-       else if(reg_num < (domain->numReg() - (domain->numReg()+15)/20))
-         rep = 1 + domain->cost();
-       //very expensive regions
-       else
-	 rep = 10 * (1+ domain->cost());
-       EvalEOSForElems(domain, vnewc, p_old, compression, compHalfStep,
-                       work, p_new, e_new, q_new, bvc, pbvc, pHalfStep,
-                       reg_num, rep);
-    }
-
-    elemMemPool.release(&pHalfStep) ;
-    elemMemPool.release(&pbvc) ;
-    elemMemPool.release(&bvc) ;
-    elemMemPool.release(&q_new) ;
-    elemMemPool.release(&e_new) ;
-    elemMemPool.release(&p_new) ;
-    elemMemPool.release(&work) ;
-    elemMemPool.release(&compHalfStep) ;
-    elemMemPool.release(&compression) ;
-    elemMemPool.release(&p_old) ;
-    elemMemPool.release(&vnewc) ;
-  }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void UpdateVolumesForElems(Domain* domain, 
-                           Real_t v_cut)
-{
-   RAJA::forall<elem_exec_policy>(domain->getElemISet(),
-        [=] RAJA_DEVICE (int i) { 
-      Real_t tmpV = domain->vnew(i) ;
-
-      if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-         tmpV = Real_t(1.0) ;
-
-      domain->v(i) = tmpV ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeElements(Domain* domain, Index_t numElem)
-{
-  CalcLagrangeElements(domain) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems(domain) ;
-
-  ApplyMaterialPropertiesForElems(domain) ;
-
-  UpdateVolumesForElems(domain,
-                        domain->v_cut()) ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcCourantConstraintForElems(Domain* domain, int reg_num,
-                                   Real_t qqc, Real_t& dtcourant)
-{
-   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
-
-   RAJA::ReduceMin<reduce_policy, Real_t> dtcourantLoc(dtcourant) ;
-
-   RAJA::forall<mat_exec_policy>(domain->getRegionISet(reg_num),
-        [=] RAJA_DEVICE (int indx) {
-
-      Real_t dtf = domain->ss(indx) * domain->ss(indx) ;
-
-      if ( domain->vdov(indx) < Real_t(0.) ) {
-         dtf += qqc2 * domain->arealg(indx) * domain->arealg(indx) *
-                domain->vdov(indx) * domain->vdov(indx) ;
-      }
-
-      Real_t dtf_cmp = (domain->vdov(indx) != Real_t(0.))
-                     ?  domain->arealg(indx) / SQRT(dtf) : Real_t(1.0e+20) ;
-
-      /* determine minimum timestep with its corresponding elem */
-      dtcourantLoc.min(dtf_cmp) ;
-   } ) ;
-
-   /* Don't try to register a time constraint if none of the elements
-    * were active */
-   if (dtcourantLoc < Real_t(1.0e+20)) {
-      dtcourant = dtcourantLoc ;
-   }
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcHydroConstraintForElems(Domain* domain, int reg_num,
-                                 Real_t dvovmax, Real_t& dthydro)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dthydroLoc(dthydro) ;
-
-   RAJA::forall<mat_exec_policy>(domain->getRegionISet(reg_num),
-         [=] RAJA_DEVICE (int indx) {
-
-       Real_t dtvov_cmp = (domain->vdov(indx) != Real_t(0.))
-                        ? (dvovmax / (FABS(domain->vdov(indx))+Real_t(1.e-20)))
-                        : Real_t(1.0e+20) ;
-
-      dthydroLoc.min(dtvov_cmp) ;
-
-   } ) ;
-
-   if (dthydroLoc < Real_t(1.0e+20)) {
-      dthydro = dthydroLoc ;
-   }
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcTimeConstraintsForElems(Domain* domain) {
-
-   // Initialize conditions to a very large value
-   domain->dtcourant() = 1.0e+20;
-   domain->dthydro() = 1.0e+20;
-
-   for (Index_t reg_num=0 ; reg_num < domain->numReg() ; ++reg_num) {
-      /* evaluate time constraint */
-      CalcCourantConstraintForElems(domain, reg_num,
-                                    domain->qqc(),
-                                    domain->dtcourant()) ;
-
-      /* check hydro constraint */
-      CalcHydroConstraintForElems(domain, reg_num,
-                                  domain->dvovmax(),
-                                  domain->dthydro()) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeLeapFrog(Domain* domain)
-{
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   Domain_member fieldData[6] ;
-#endif
-
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal(domain);
-
-
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-#endif
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements(domain, domain->numElem());
-
-#if USE_MPI   
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   CommRecv(*domain, MSG_SYNC_POS_VEL, 6,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ; 
-
-   fieldData[0] = &Domain::x ;
-   fieldData[1] = &Domain::y ;
-   fieldData[2] = &Domain::z ;
-   fieldData[3] = &Domain::xd ;
-   fieldData[4] = &Domain::yd ;
-   fieldData[5] = &Domain::zd ;
-   
-   CommSend(*domain, MSG_SYNC_POS_VEL, 6, fieldData,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-#endif
-#endif   
-
-   CalcTimeConstraintsForElems(domain);
-
-#if USE_MPI   
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   CommSyncPosVel(*domain) ;
-#endif
-#endif   
-}
-
-
-/******************************************/
-
-int main(int argc, char *argv[])
-{
-   Domain *locDom ;
-   Int_t numRanks ;
-   Int_t myRank ;
-   struct cmdLineOpts opts;
-
-#if USE_MPI   
-   Domain_member fieldData ;
-
-   MPI_Init(&argc, &argv) ;
-   MPI_Comm_size(MPI_COMM_WORLD, &numRanks) ;
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-#else
-   numRanks = 1;
-   myRank = 0;
-#endif   
-
-   /* Set defaults that can be overridden by command line opts */
-   opts.its = 9999999;
-   opts.nx  = 30;
-   opts.numReg = 11;
-   opts.numFiles = (int)(numRanks+10)/9;
-   opts.showProg = 0;
-   opts.quiet = 0;
-   opts.viz = 0;
-   opts.balance = 1;
-   opts.cost = 1;
-
-   ParseCommandLineOptions(argc, argv, myRank, &opts);
-
-   if ((myRank == 0) && (opts.quiet == 0)) {
-      printf("Running problem size %d^3 per domain until completion\n", opts.nx);
-      printf("Num processors: %d\n", numRanks);
-#if defined(_OPENMP)
-      printf("Num threads: %d\n", omp_get_max_threads());
-#endif
-      printf("Total number of elements: %lld\n\n", (long long int)(numRanks*opts.nx*opts.nx*opts.nx));
-      printf("To run other sizes, use -s <integer>.\n");
-      printf("To run a fixed number of iterations, use -i <integer>.\n");
-      printf("To run a more or less balanced region set, use -b <integer>.\n");
-      printf("To change the relative costs of regions, use -c <integer>.\n");
-      printf("To print out progress, use -p\n");
-      printf("To write an output file for VisIt, use -v\n");
-      printf("See help (-h) for more options\n\n");
-   }
-
-   // Set up the mesh and decompose. Assumes regular cubes for now
-   Int_t col, row, plane, side;
-   InitMeshDecomp(numRanks, myRank, &col, &row, &plane, &side);
-
-   // Build the main data structure and initialize it
-   locDom = new Domain(numRanks, col, row, plane, opts.nx,
-                       side, opts.numReg, opts.balance, opts.cost) ;
-
-
-#if USE_MPI   
-   fieldData = &Domain::nodalMass ;
-
-   // Initial domain boundary communication 
-   CommRecv(*locDom, MSG_COMM_SBN, 1,
-            locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() + 1,
-            true, false) ;
-   CommSend(*locDom, MSG_COMM_SBN, 1, &fieldData,
-            locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() +  1,
-            true, false) ;
-   CommSBN(*locDom, 1, &fieldData) ;
-
-   // End initialization
-   MPI_Barrier(MPI_COMM_WORLD);
-#endif   
-   // BEGIN timestep to solution */
-#ifdef RAJA_USE_CALIPER
-   RAJA::Timer timer_main; 
-   timer_main.start("timer_main");
-#else
-#if USE_MPI   
-   double start = MPI_Wtime();
-#else
-   timeval start;
-   gettimeofday(&start, NULL) ;
-#endif
-#endif
-//debug to see region sizes
-// for(Int_t i = 0; i < locDom->numReg(); i++) {
-//    std::cout << "region " << i + 1<< " size = " << locDom->regElemSize(i) << std::endl;
-//    RAJA::forall<mat_exec_policy>(locDom->getRegionISet(i), [=] (int idx) { printf("%d ", idx) ; }) ;
-//    printf("\n\n") ;
-// }
-   while((locDom->time() < locDom->stoptime()) && (locDom->cycle() < opts.its)) {
-
-      TimeIncrement(*locDom) ;
-      LagrangeLeapFrog(locDom) ;
-
-      if ((opts.showProg != 0) && (opts.quiet == 0) && (myRank == 0)) {
-         printf("cycle = %d, time = %e, dt=%e\n",
-                locDom->cycle(), double(locDom->time()), double(locDom->deltatime()) ) ;
-      }
-   }
-double elapsed_time;
-#ifdef RAJA_USE_CALIPER
-   // Use reduced max elapsed time
-   timer_main.stop("timer_main");
-   elapsed_time = timer_main.elapsed();
-#else
-#if USE_MPI   
-   elapsed_time = MPI_Wtime() - start;
-#else
-   timeval end;
-   gettimeofday(&end, NULL) ;
-   elapsed_time = (double)(end.tv_sec - start.tv_sec) + ((double)(end.tv_usec - start.tv_usec))/1000000 ;
-#endif
-#endif
-   double elapsed_timeG;
-#if USE_MPI   
-   MPI_Reduce(&elapsed_time, &elapsed_timeG, 1, MPI_DOUBLE,
-              MPI_MAX, 0, MPI_COMM_WORLD);
-#else
-   elapsed_timeG = elapsed_time;
-#endif
-
-   // Write out final viz file */
-   if (opts.viz) {
-      DumpToVisit(*locDom, opts.numFiles, myRank, numRanks) ;
-   }
-   
-   if ((myRank == 0) && (opts.quiet == 0)) {
-      VerifyAndWriteFinalOutput(elapsed_timeG, *locDom, opts.nx, numRanks);
-   }
-
-   delete locDom;
-
-#if USE_MPI
-   MPI_Finalize() ;
-#endif
-
-   return 0 ;
-}
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh.h
deleted file mode 100644
index ffc08ece4..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh.h
+++ /dev/null
@@ -1,22 +0,0 @@
-
-#include "RAJA/RAJA.hxx"
-#include "luleshPolicy.hxx"
-#include "luleshMemory.hxx"
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-/* if luleshPolicy.hxx USE_CASE >= 9, must use lulesh_ptr.h */
-#if USE_CASE >= LULESH_CUDA_CANONICAL
-#if defined(LULESH_HEADER)
-#undef LULESH_HEADER
-#endif
-#define LULESH_HEADER 1
-#endif
-
-#if !defined(LULESH_HEADER)
-#include "lulesh_stl.h"
-#elif (LULESH_HEADER == 1)
-#include "lulesh_ptr.h"
-#else
-#include "lulesh_tuple.h"
-#endif
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/luleshMemory.hxx b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/luleshMemory.hxx
deleted file mode 100644
index 274f8a5d3..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/luleshMemory.hxx
+++ /dev/null
@@ -1,187 +0,0 @@
-// This work was performed under the auspices of the U.S. Department of Energy by
-// Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344.
-//
-
-//
-// ALLOCATE/RELEASE FUNCTIONS 
-//
-
-#if defined(RAJA_ENABLE_CUDA) // CUDA managed memory allocate/release
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-template <typename T>
-inline T *Allocate(size_t size)
-{
-   T *retVal ;
-   cudaErrchk( cudaMallocManaged((void **)&retVal, sizeof(T)*size, cudaMemAttachGlobal) ) ;
-   return retVal ;
-}
-
-template <typename EXEC_POLICY_T, typename T>
-inline T *AllocateTouch(RAJA::IndexSet *is, size_t size)
-{
-   T *retVal ;
-   cudaErrchk( cudaMallocManaged((void **)&retVal, sizeof(T)*size, cudaMemAttachGlobal) ) ;
-   cudaMemset(retVal,0,sizeof(T)*size);
-   return retVal ;
-}
-
-template <typename T>
-inline void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      cudaErrchk( cudaFree(*ptr) ) ;
-      *ptr = NULL ;
-   }
-}
-
-template <typename T>
-inline void Release(T * __restrict__ *ptr)
-{
-   if (*ptr != NULL) {
-      cudaErrchk( cudaFree(*ptr) ) ;
-      *ptr = NULL ;
-   }
-}
-
-
-#else  // Standard CPU memory allocate/release
-
-#include <cstdlib>
-#include <cstring>
-
-template <typename T>
-inline T *Allocate(size_t size)
-{
-   T *retVal ;
-   posix_memalign((void **)&retVal, RAJA::DATA_ALIGN, sizeof(T)*size);
-// memset(retVal,0,sizeof(T)*size);
-   return retVal ;
-}
-
-template <typename EXEC_POLICY_T, typename T>
-inline T *AllocateTouch(RAJA::IndexSet *is, size_t size)
-{
-   T *retVal ;
-   posix_memalign((void **)&retVal, RAJA::DATA_ALIGN, sizeof(T)*size);
-
-   /* we should specialize by policy type here */
-   RAJA::forall<EXEC_POLICY_T>( *is, [=] RAJA_DEVICE (int i) {
-      retVal[i] = 0 ;
-   } ) ;
-
-   return retVal ;
-}
-
-template <typename T>
-inline void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-template <typename T>
-inline void Release(T * __restrict__ *ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-#endif 
-
-
-/**********************************/
-/* Memory Pool                    */
-/**********************************/
-
-namespace RAJA {
-
-template <typename VARTYPE >
-struct MemoryPool {
-public:
-   MemoryPool()
-   {
-      for (int i=0; i<32; ++i) {
-         lenType[i] = 0 ;
-         ptr[i] = 0 ;
-      }
-   }
-
-   VARTYPE *allocate(int len) {
-      VARTYPE *retVal ;
-      int i ;
-      for (i=0; i<32; ++i) {
-         if (lenType[i] == len) {
-            lenType[i] = -lenType[i] ;
-            retVal = ptr[i] ;
-#if 0
-            /* migrate smallest lengths to be first in list */
-            /* since longer lengths can amortize lookup cost */
-            if (i > 0) {
-               if (len < abs(lenType[i-1])) {
-                  lenType[i] = lenType[i-1] ;
-                  ptr[i] = ptr[i-1] ;
-                  lenType[i-1] = -len ;
-                  ptr[i-1] = retVal ;
-               }
-            }
-#endif
-            break ;
-         }
-         else if (lenType[i] == 0) {
-            lenType[i] = -len ;
-            ptr[i] = Allocate<VARTYPE>(len) ;
-            retVal = ptr[i] ;
-            break ;
-         }
-      }
-      if (i == 32) {
-         retVal = 0 ;  /* past max available pointers */
-      }
-      return retVal ;
-   }
-
-   bool release(VARTYPE **oldPtr) {
-      int i ;
-      bool success = true ;
-      for (i=0; i<32; ++i) {
-         if (ptr[i] == *oldPtr) {
-            lenType[i] = -lenType[i] ;
-            *oldPtr = 0 ;
-            break ;
-         }
-      }
-      if (i == 32) {
-         success = false ; /* error -- not found */
-      }
-      return success ;
-   }
-
-   bool release(VARTYPE * __restrict__ *oldPtr) {
-      int i ;
-      bool success = true ;
-      for (i=0; i<32; ++i) {
-         if (ptr[i] == *oldPtr) {
-            lenType[i] = -lenType[i] ;
-            *oldPtr = 0 ;
-            break ;
-         }
-      }
-      if (i == 32) {
-         success = false ; /* error -- not found */
-      }
-      return success ; 
-   }
-
-   VARTYPE *ptr[32] ; 
-   int lenType[32] ;
-} ;
-
-}
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/luleshPolicy.hxx b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/luleshPolicy.hxx
deleted file mode 100644
index 0c8a9e42a..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/luleshPolicy.hxx
+++ /dev/null
@@ -1,94 +0,0 @@
-// This work was performed under the auspices of the U.S. Department of Energy by
-// Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344.
-//
-
-//
-//   Tiling modes for different exeuction cases (see luleshPolicy.hxx).
-//
-enum TilingMode
-{
-   Canonical,       // canonical element ordering -- single range segment
-   Tiled_Index,     // canonical ordering, tiled using unstructured segments
-   Tiled_Order,     // elements permuted, tiled using range segments
-   Tiled_LockFree,  // tiled ordering, lock-free
-   Tiled_LockFreeColor,     // tiled ordering, lock-free, unstructured
-   Tiled_LockFreeColorSIMD  // tiled ordering, lock-free, range
-};
-
-
-// Use cases for RAJA execution patterns:
-
-#define LULESH_SEQUENTIAL       1 /* (possible SIMD vectorization applied) */
-#define LULESH_CANONICAL        2 /*  OMP forall applied to each for loop */
-#define LULESH_CUDA_CANONICAL   9 /*  CUDA launch applied to each loop */
-#define LULESH_STREAM_EXPERIMENTAL 11 /* Work in progress... */
-
-#ifndef USE_CASE
-#define USE_CASE   LULESH_CANONICAL
-#endif
-
-
-
-// ----------------------------------------------------
-#if USE_CASE == LULESH_SEQUENTIAL 
-
-TilingMode const lulesh_tiling_mode = Canonical;
-
-typedef RAJA::seq_segit              Segment_Iter;
-typedef RAJA::simd_exec              Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> symnode_exec_policy;
-
-typedef RAJA::seq_reduce reduce_policy; 
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_CANONICAL
-
-// Requires OMP_FINE_SYNC when run in parallel
-#define OMP_FINE_SYNC 1
-
-// AllocateTouch should definitely be used
-
-TilingMode const lulesh_tiling_mode = Canonical;
-
-typedef RAJA::seq_segit              Segment_Iter;
-typedef RAJA::omp_parallel_for_exec  Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> symnode_exec_policy;
-
-typedef RAJA::omp_reduce reduce_policy;
-
-// ----------------------------------------------------
-#elif USE_CASE == LULESH_CUDA_CANONICAL
-
-// Requires OMP_FINE_SYNC 
-#define OMP_FINE_SYNC 1
-
-TilingMode const lulesh_tiling_mode = Canonical;
-
-typedef RAJA::seq_segit         Segment_Iter;
-
-/// Define thread block size for CUDA exec policy
-const size_t thread_block_size = 256;
-typedef RAJA::cuda_exec<thread_block_size>    Segment_Exec;
-
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> mat_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<Segment_Iter, Segment_Exec> symnode_exec_policy;
-
-typedef RAJA::cuda_reduce<thread_block_size> reduce_policy;
-
-// ----------------------------------------------------
-#else
-
-#error "You must define a use case in luleshPolicy.cxx"
-
-#endif
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_ptr.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_ptr.h
deleted file mode 100644
index 8439a9984..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_ptr.h
+++ /dev/null
@@ -1,686 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <stdlib.h>
-#include <math.h>
-#include <vector>
-
-#include "RAJA/RAJA.hxx"
-
-//
-//   RAJA IndexSet type used in loop traversals.
-//
-typedef RAJA::IndexSet LULESH_ISET;
-
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-typedef Real_t * __restrict__ Real_p ;
-typedef Index_t * __restrict__ Index_p ;
-typedef Int_t * __restrict__ Int_p ;
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline RAJA_DEVICE
-real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline RAJA_DEVICE
-real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline RAJA_DEVICE
-real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline RAJA_DEVICE
-real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline RAJA_DEVICE
-real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline RAJA_DEVICE
-real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline RAJA_DEVICE
-real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline RAJA_DEVICE
-real8  FABS(real8  arg) { return fabs(arg) ; }
-inline RAJA_DEVICE
-real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-#if defined(RAJA_ENABLE_CUDA)
-   void *operator new(size_t size)
-   {
-     void *ptr ;
-     cudaMallocManaged((void **)&ptr, size, cudaMemAttachGlobal) ;
-     return ptr ;
-   }
-
-   void operator delete(void *ptr)
-   {
-     cudaFree(ptr) ;
-   }
-#endif
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Index_t numNode) // Node-centered
-   {
-      m_x = Allocate<Real_t>(numNode) ; // coordinates
-      m_y = Allocate<Real_t>(numNode) ;
-      m_z = Allocate<Real_t>(numNode) ;
-
-      m_xd = Allocate<Real_t>(numNode) ; // velocities
-      m_yd = Allocate<Real_t>(numNode) ;
-      m_zd = Allocate<Real_t>(numNode) ;
-
-      m_xdd = Allocate<Real_t>(numNode) ; // accelerations
-      m_ydd = Allocate<Real_t>(numNode) ;
-      m_zdd = Allocate<Real_t>(numNode) ;
-
-      m_fx = Allocate<Real_t>(numNode) ; // forces
-      m_fy = Allocate<Real_t>(numNode) ;
-      m_fz = Allocate<Real_t>(numNode) ;
-
-      m_nodalMass = Allocate<Real_t>(numNode) ; // mass
-   }
-
-   void AllocateElemPersistent(Index_t numElem) // Elem-centered
-   {
-      m_nodelist = Allocate<Index_t>(8*numElem) ;
-
-      // elem connectivities through face
-      m_lxim = Allocate<Index_t>(numElem) ;
-      m_lxip = Allocate<Index_t>(numElem) ;
-      m_letam = Allocate<Index_t>(numElem) ;
-      m_letap = Allocate<Index_t>(numElem) ;
-      m_lzetam = Allocate<Index_t>(numElem) ;
-      m_lzetap = Allocate<Index_t>(numElem) ;
-
-      m_elemBC = Allocate<Int_t>(numElem) ;
-
-      m_e = Allocate<Real_t>(numElem) ;
-      m_p = Allocate<Real_t>(numElem) ;
-
-      m_q = Allocate<Real_t>(numElem) ;
-      m_ql = Allocate<Real_t>(numElem) ;
-      m_qq = Allocate<Real_t>(numElem) ;
-
-      m_v = Allocate<Real_t>(numElem) ;
-
-      m_volo = Allocate<Real_t>(numElem) ;
-      m_delv = Allocate<Real_t>(numElem) ;
-      m_vdov = Allocate<Real_t>(numElem) ;
-
-      m_arealg = Allocate<Real_t>(numElem) ;
-
-      m_ss = Allocate<Real_t>(numElem) ;
-
-      m_elemMass = Allocate<Real_t>(numElem) ;
-
-      m_vnew = Allocate<Real_t>(numElem) ;
-   }
-
-   void AllocateGradients(RAJA::MemoryPool< Real_t> &pool,
-                          Index_t numElem, Index_t allElem)
-   {
-      // Position gradients
-      m_delx_xi = pool.allocate(numElem) ;
-      m_delx_eta = pool.allocate(numElem) ;
-      m_delx_zeta = pool.allocate(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi = pool.allocate(allElem) ;
-      m_delv_eta = pool.allocate(allElem) ;
-      m_delv_zeta = pool.allocate(allElem) ;
-   }
-
-   void DeallocateGradients(RAJA::MemoryPool< Real_t> &pool)
-   {
-      pool.release(&m_delv_zeta) ;
-      pool.release(&m_delv_eta) ;
-      pool.release(&m_delv_xi) ;
-
-      pool.release(&m_delx_zeta) ;
-      pool.release(&m_delx_eta) ;
-      pool.release(&m_delx_xi) ;
-   }
-
-   void AllocateStrains(RAJA::MemoryPool< Real_t > &pool,
-                        Index_t numElem)
-   {
-      m_dxx = pool.allocate(numElem) ;
-      m_dyy = pool.allocate(numElem) ;
-      m_dzz = pool.allocate(numElem) ;
-   }
-
-   void DeallocateStrains(RAJA::MemoryPool< Real_t > &pool)
-   {
-      pool.release(&m_dzz) ;
-      pool.release(&m_dyy) ;
-      pool.release(&m_dxx) ;
-   }
-
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_x[idx] ; }
-   Real_t& y(Index_t idx)    { return m_y[idx] ; }
-   Real_t& z(Index_t idx)    { return m_z[idx] ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
-   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
-   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
-   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
-   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
-   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
-   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   //
-   // Element-centered
-   //
-   Index_p  nodelist(Index_t idx) { return &m_nodelist[Index_t(8)*idx] ; }
-
-#if !defined(LULESH_LIST_INDEXSET)
-   Index_t&  perm(Index_t idx)     { return m_perm[idx] ; }
-#else
-   Index_t  perm(Index_t idx)     { return idx ; }
-#endif
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
-   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
-   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
-   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
-   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
-   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_p[idx] ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_q[idx] ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_v[idx] ; }
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-#if defined(OMP_FINE_SYNC)
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_p nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-#endif
-
-   // Region Centered
-
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_p   regNumList()            { return &m_regNumList[0] ; }
-   Index_p   regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx)
-   { return m_regElemlist[r][idx] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-
-   //
-   // Accessors for index sets
-   //
-   LULESH_ISET& getNodeISet()    { return m_domNodeISet ; }
-   LULESH_ISET& getElemISet()    { return m_domElemISet ; }
-   LULESH_ISET& getElemRegISet() { return m_domElemRegISet ; }
-
-   LULESH_ISET& getRegionISet(int r) { return m_domRegISet[r] ; }
-
-   LULESH_ISET& getXSymNodeISet() { return m_domXSymNodeISet ; }
-   LULESH_ISET& getYSymNodeISet() { return m_domYSymNodeISet ; }
-   LULESH_ISET& getZSymNodeISet() { return m_domZSymNodeISet ; }
-
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_p commDataSend ;
-   Real_p commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMeshTopology(Index_t edgeNodes, Index_t edgeElems);
-   void BuildMeshCoordinates(Index_t nx, Index_t edgeNodes);
-   void SetupThreadSupportStructures();
-   void CreateMeshIndexSets();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void CreateSymmetryIndexSets(Index_t edgeNodes);
-   void SetupCommBuffers(Index_t edgeNodes);
-   void SetupElementConnectivities(Index_t edgeElems);
-   void SetupBoundaryConditions(Index_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* mesh-based index sets */
-   LULESH_ISET m_domNodeISet ;
-   LULESH_ISET m_domElemISet ;
-   LULESH_ISET m_domElemRegISet ;
-
-   LULESH_ISET m_domXSymNodeISet ;
-   LULESH_ISET m_domYSymNodeISet ;
-   LULESH_ISET m_domZSymNodeISet ;
-
-   /* region-based index sets */
-   std::vector<LULESH_ISET> m_domRegISet;
-
-   /* Node-centered */
-   Real_p m_x ;  /* coordinates */
-   Real_p m_y ;
-   Real_p m_z ;
-
-   Real_p m_xd ; /* velocities */
-   Real_p m_yd ;
-   Real_p m_zd ;
-
-   Real_p m_xdd ; /* accelerations */
-   Real_p m_ydd ;
-   Real_p m_zdd ;
-
-   Real_p m_fx ;  /* forces */
-   Real_p m_fy ;
-   Real_p m_fz ;
-
-   Real_p m_nodalMass ;  /* mass */
-
-   // Element-centered
-
-   Index_p  m_nodelist ;     /* elemToNode connectivity */
-
-   Index_p  m_lxim ;  /* element connectivity across each face */
-   Index_p  m_lxip ;
-   Index_p  m_letam ;
-   Index_p  m_letap ;
-   Index_p  m_lzetam ;
-   Index_p  m_lzetap ;
-
-   Int_p    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   Real_p m_dxx ;  /* principal strains -- temporary */
-   Real_p m_dyy ;
-   Real_p m_dzz ;
-
-   Real_p m_delv_xi ;    /* velocity gradient -- temporary */
-   Real_p m_delv_eta ;
-   Real_p m_delv_zeta ;
-
-   Real_p m_delx_xi ;    /* coordinate gradient -- temporary */
-   Real_p m_delx_eta ;
-   Real_p m_delx_zeta ;
-
-   Real_p m_e ;   /* energy */
-
-   Real_p m_p ;   /* pressure */
-   Real_p m_q ;   /* q */
-   Real_p m_ql ;  /* linear term for q */
-   Real_p m_qq ;  /* quadratic term for q */
-
-   Real_p m_v ;     /* relative volume */
-   Real_p m_volo ;  /* reference volume */
-   Real_p m_vnew ;  /* new relative volume -- temporary */
-   Real_p m_delv ;  /* m_vnew - m_v */
-   Real_p m_vdov ;  /* volume derivative over volume */
-
-   Real_p m_arealg ;  /* characteristic length of an element */
-
-   Real_p m_ss ;      /* "sound speed" */
-
-   Real_p m_elemMass ;  /* mass */
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_p m_regElemSize ;   // Size of region sets
-   Index_p m_regNumList ;    // Region number per domain element
-   Index_p *m_regElemlist ;  // region indexset
-
-   // Permutation to pack element-centered material subsets
-   // into a contiguous range per material
-   Index_p m_perm ;
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-#if defined(OMP_FINE_SYNC)
-   Index_p m_nodeElemStart ;
-   Index_p m_nodeElemCornerList ;
-#endif
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-RAJA_HOST_DEVICE
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_stl.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_stl.h
deleted file mode 100644
index 617a331db..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_stl.h
+++ /dev/null
@@ -1,674 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <math.h>
-#include <vector>
-
-#include "RAJA/RAJA.hxx"
-
-//
-//   RAJA IndexSet type used in loop traversals.
-//
-typedef RAJA::IndexSet LULESH_ISET;
-
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline RAJA_DEVICE
-real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline RAJA_DEVICE
-real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline RAJA_DEVICE
-real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline RAJA_DEVICE
-real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline RAJA_DEVICE
-real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline RAJA_DEVICE
-real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline RAJA_DEVICE
-real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline RAJA_DEVICE
-real8  FABS(real8  arg) { return fabs(arg) ; }
-inline RAJA_DEVICE
-real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Int_t numNode) // Node-centered
-   {
-      m_x.reserve(numNode);  // coordinates
-      m_y.reserve(numNode);
-      m_z.reserve(numNode);
-
-      m_xd.reserve(numNode); // velocities
-      m_yd.reserve(numNode);
-      m_zd.reserve(numNode);
-
-      m_xdd.reserve(numNode); // accelerations
-      m_ydd.reserve(numNode);
-      m_zdd.reserve(numNode);
-
-      m_fx.reserve(numNode);  // forces
-      m_fy.reserve(numNode);
-      m_fz.reserve(numNode);
-
-      m_nodalMass.reserve(numNode);  // mass
-   }
-
-   void AllocateElemPersistent(Int_t numElem) // Elem-centered
-   {
-      m_nodelist.reserve(8*numElem);
-
-      // elem connectivities through face
-      m_lxim.reserve(numElem);
-      m_lxip.reserve(numElem);
-      m_letam.reserve(numElem);
-      m_letap.reserve(numElem);
-      m_lzetam.reserve(numElem);
-      m_lzetap.reserve(numElem);
-
-      m_elemBC.reserve(numElem);
-
-      m_e.reserve(numElem);
-      m_p.reserve(numElem);
-
-      m_q.reserve(numElem);
-      m_ql.reserve(numElem);
-      m_qq.reserve(numElem);
-
-      m_v.reserve(numElem);
-
-      m_volo.reserve(numElem);
-      m_delv.reserve(numElem);
-      m_vdov.reserve(numElem);
-
-      m_arealg.reserve(numElem);
-
-      m_ss.reserve(numElem);
-
-      m_elemMass.reserve(numElem);
-
-      m_vnew.reserve(numElem) ;
-   }
-
-   void AllocateGradients(RAJA::MemoryPool< Real_t > &pool,
-                          Int_t numElem, Int_t allElem)
-   {
-      (void) pool ;
-
-      // Position gradients
-      m_delx_xi.reserve(numElem) ;
-      m_delx_eta.reserve(numElem) ;
-      m_delx_zeta.reserve(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi.reserve(allElem) ;
-      m_delv_eta.reserve(allElem);
-      m_delv_zeta.reserve(allElem) ;
-   }
-
-   void DeallocateGradients(RAJA::MemoryPool< Real_t > &pool)
-   {
-      (void) pool ;
-      m_delx_zeta.clear() ;
-      m_delx_eta.clear() ;
-      m_delx_xi.clear() ;
-
-      m_delv_zeta.clear() ;
-      m_delv_eta.clear() ;
-      m_delv_xi.clear() ;
-   }
-
-   void AllocateStrains(RAJA::MemoryPool< Real_t > &pool,
-                        Int_t numElem)
-   {
-      (void) pool ;
-
-      m_dxx.reserve(numElem) ;
-      m_dyy.reserve(numElem) ;
-      m_dzz.reserve(numElem) ;
-   }
-
-   void DeallocateStrains(RAJA::MemoryPool< Real_t > &pool)
-   {
-      (void) pool ;
-
-      m_dzz.clear() ;
-      m_dyy.clear() ;
-      m_dxx.clear() ;
-   }
-   
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_x[idx] ; }
-   Real_t& y(Index_t idx)    { return m_y[idx] ; }
-   Real_t& z(Index_t idx)    { return m_z[idx] ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
-   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
-   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
-   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
-   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
-   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
-   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   //
-   // Element-centered
-   //
-   Index_t*  nodelist(Index_t idx) { return &m_nodelist[Index_t(8)*idx] ; }
-
-#if !defined(LULESH_LIST_INDEXSET)
-   Index_t&  perm(Index_t idx)     { return m_perm[idx] ; }
-#else
-   Index_t  perm(Index_t idx)     { return idx ; }
-#endif
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
-   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
-   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
-   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
-   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
-   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_p[idx] ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_q[idx] ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_v[idx] ; }
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-#if defined(OMP_FINE_SYNC)
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_t *nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-#endif
-
-   // Region Centered
-
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_t*  regNumList()            { return &m_regNumList[0] ; }
-   Index_t*  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx)
-   { return m_regElemlist[r][idx] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-
-   //
-   // Accessors for index sets
-   //
-   LULESH_ISET& getNodeISet()    { return m_domNodeISet ; }
-   LULESH_ISET& getElemISet()    { return m_domElemISet ; }
-   LULESH_ISET& getElemRegISet() { return m_domElemRegISet ; }
-
-   LULESH_ISET& getRegionISet(int r) { return m_domRegISet[r] ; }
-
-   LULESH_ISET& getXSymNodeISet() { return m_domXSymNodeISet ; }
-   LULESH_ISET& getYSymNodeISet() { return m_domYSymNodeISet ; }
-   LULESH_ISET& getZSymNodeISet() { return m_domZSymNodeISet ; }
-
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_t *commDataSend ;
-   Real_t *commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMeshTopology(Index_t edgeNodes, Index_t edgeElems);
-   void BuildMeshCoordinates(Index_t nx, Index_t edgeNodes);
-   void SetupThreadSupportStructures();
-   void CreateMeshIndexSets();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void CreateSymmetryIndexSets(Index_t edgeNodes);
-   void SetupCommBuffers(Index_t edgeNodes);
-   void SetupElementConnectivities(Index_t edgeElems);
-   void SetupBoundaryConditions(Index_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* mesh-based index sets */
-   LULESH_ISET m_domNodeISet ;
-   LULESH_ISET m_domElemISet ;
-   LULESH_ISET m_domElemRegISet ;
-
-   LULESH_ISET m_domXSymNodeISet ;
-   LULESH_ISET m_domYSymNodeISet ;
-   LULESH_ISET m_domZSymNodeISet ;
-
-   /* region-based index sets */
-   std::vector<LULESH_ISET> m_domRegISet;
-
-   /* Node-centered */
-   std::vector<Real_t> m_x ;  /* coordinates */
-   std::vector<Real_t> m_y ;
-   std::vector<Real_t> m_z ;
-
-   std::vector<Real_t> m_xd ; /* velocities */
-   std::vector<Real_t> m_yd ;
-   std::vector<Real_t> m_zd ;
-
-   std::vector<Real_t> m_xdd ; /* accelerations */
-   std::vector<Real_t> m_ydd ;
-   std::vector<Real_t> m_zdd ;
-
-   std::vector<Real_t> m_fx ;  /* forces */
-   std::vector<Real_t> m_fy ;
-   std::vector<Real_t> m_fz ;
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   // Element-centered
-
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */
-   std::vector<Index_t>  m_lxip ;
-   std::vector<Index_t>  m_letam ;
-   std::vector<Index_t>  m_letap ;
-   std::vector<Index_t>  m_lzetam ;
-   std::vector<Index_t>  m_lzetap ;
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   std::vector<Real_t> m_p ;   /* pressure */
-   std::vector<Real_t> m_q ;   /* q */
-   std::vector<Real_t> m_ql ;  /* linear term for q */
-   std::vector<Real_t> m_qq ;  /* quadratic term for q */
-
-   std::vector<Real_t> m_v ;     /* relative volume */
-   std::vector<Real_t> m_volo ;  /* reference volume */
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_t *m_regElemSize ;   // Size of region sets
-   Index_t *m_regNumList ;    // Region number per domain element
-   Index_t **m_regElemlist ;  // region indexset 
-
-   // Permutation to pack element-centered material subsets
-   // into a contiguous range per material
-   Index_t *m_perm ;
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-#if defined(OMP_FINE_SYNC)
-   Index_t *m_nodeElemStart ;
-   Index_t *m_nodeElemCornerList ;
-#endif
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-RAJA_HOST_DEVICE
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_tuple.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_tuple.h
deleted file mode 100644
index 67ee8ed7e..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-IndexSet/lulesh_tuple.h
+++ /dev/null
@@ -1,667 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <math.h>
-#include <vector>
-
-#include "RAJA/RAJA.hxx"
-
-//
-//   RAJA IndexSet type used in loop traversals.
-//
-typedef RAJA::IndexSet LULESH_ISET;
-
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline RAJA_DEVICE
-real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline RAJA_DEVICE
-real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline RAJA_DEVICE
-real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline RAJA_DEVICE
-real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline RAJA_DEVICE
-real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline RAJA_DEVICE
-real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline RAJA_DEVICE
-real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline RAJA_DEVICE
-real8  FABS(real8  arg) { return fabs(arg) ; }
-inline RAJA_DEVICE
-real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Int_t numNode) // Node-centered
-   {
-      m_coord.reserve(numNode);  // coordinates
-
-      m_vel.reserve(numNode); // velocities
-
-      m_acc.reserve(numNode); // accelerations
-
-      m_force.reserve(numNode);  // forces
-
-      m_nodalMass.reserve(numNode);  // mass
-   }
-
-   void AllocateElemPersistent(Int_t numElem) // Elem-centered
-   {
-      m_nodelist.reserve(8*numElem);
-
-      // elem connectivities through face
-      m_faceToElem.reserve(numElem);
-
-      m_elemBC.reserve(numElem);
-
-      m_e.reserve(numElem);
-
-      m_pq.reserve(numElem);
-
-      m_qlqq.reserve(numElem);
-
-      m_vol.reserve(numElem);
-
-      m_delv.reserve(numElem);
-      m_vdov.reserve(numElem);
-
-      m_arealg.reserve(numElem);
-
-      m_ss.reserve(numElem);
-
-      m_elemMass.reserve(numElem);
-
-      m_vnew.reserve(numElem) ;
-   }
-
-   void AllocateGradients(RAJA::MemoryPool< Real_t > &pool,
-                          Int_t numElem, Int_t allElem)
-   {
-      (void) pool ;
-
-      // Position gradients
-      m_delx_xi.reserve(numElem) ;
-      m_delx_eta.reserve(numElem) ;
-      m_delx_zeta.reserve(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi.reserve(allElem) ;
-      m_delv_eta.reserve(allElem);
-      m_delv_zeta.reserve(allElem) ;
-   }
-
-   void DeallocateGradients(RAJA::MemoryPool< Real_t > &pool)
-   {
-      (void) pool ;
-
-      m_delx_zeta.clear() ;
-      m_delx_eta.clear() ;
-      m_delx_xi.clear() ;
-
-      m_delv_zeta.clear() ;
-      m_delv_eta.clear() ;
-      m_delv_xi.clear() ;
-   }
-
-   void AllocateStrains(RAJA::MemoryPool< Real_t > &pool,
-                        Int_t numElem)
-   {
-      (void) pool ;
-
-      m_dxx.reserve(numElem) ;
-      m_dyy.reserve(numElem) ;
-      m_dzz.reserve(numElem) ;
-   }
-
-   void DeallocateStrains(RAJA::MemoryPool< Real_t > &pool)
-   {
-      (void) pool ;
-
-      m_dzz.clear() ;
-      m_dyy.clear() ;
-      m_dxx.clear() ;
-   }
-   
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_coord[idx].x ; }
-   Real_t& y(Index_t idx)    { return m_coord[idx].y ; }
-   Real_t& z(Index_t idx)    { return m_coord[idx].z ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_vel[idx].x ; }
-   Real_t& yd(Index_t idx)   { return m_vel[idx].y ; }
-   Real_t& zd(Index_t idx)   { return m_vel[idx].z ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_acc[idx].x ; }
-   Real_t& ydd(Index_t idx)  { return m_acc[idx].y ; }
-   Real_t& zdd(Index_t idx)  { return m_acc[idx].z ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_force[idx].x ; }
-   Real_t& fy(Index_t idx)   { return m_force[idx].y ; }
-   Real_t& fz(Index_t idx)   { return m_force[idx].z ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   //
-   // Element-centered
-   //
-   Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
-
-#if !defined(LULESH_LIST_INDEXSET)
-   Index_t&  perm(Index_t idx)     { return m_perm[idx] ; }
-#else
-   Index_t  perm(Index_t idx)     { return idx ; }
-#endif
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_faceToElem[idx].lxim ; }
-   Index_t&  lxip(Index_t idx) { return m_faceToElem[idx].lxip ; }
-   Index_t&  letam(Index_t idx) { return m_faceToElem[idx].letam ; }
-   Index_t&  letap(Index_t idx) { return m_faceToElem[idx].letap ; }
-   Index_t&  lzetam(Index_t idx) { return m_faceToElem[idx].lzetam ; }
-   Index_t&  lzetap(Index_t idx) { return m_faceToElem[idx].lzetap ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_pq[idx].p ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_pq[idx].q ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_qlqq[idx].ql ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qlqq[idx].qq ; }
-
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_vol[idx].v ; }
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_vol[idx].volo ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-#if defined(OMP_FINE_SYNC)
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_t *nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-#endif
-
-   // Region Centered
-
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_t*  regNumList()            { return &m_regNumList[0] ; }
-   Index_t*  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx)
-   { return m_regElemlist[r][idx] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-
-   //
-   // Accessors for index sets
-   //
-   LULESH_ISET& getNodeISet()    { return m_domNodeISet ; }
-   LULESH_ISET& getElemISet()    { return m_domElemISet ; }
-   LULESH_ISET& getElemRegISet() { return m_domElemRegISet ; }
-
-   LULESH_ISET& getRegionISet(int r) { return m_domRegISet[r] ; }
-
-   LULESH_ISET& getXSymNodeISet() { return m_domXSymNodeISet ; }
-   LULESH_ISET& getYSymNodeISet() { return m_domYSymNodeISet ; }
-   LULESH_ISET& getZSymNodeISet() { return m_domZSymNodeISet ; }
-   
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_t *commDataSend ;
-   Real_t *commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMeshTopology(Index_t edgeNodes, Index_t edgeElems);
-   void BuildMeshCoordinates(Index_t nx, Index_t edgeNodes);
-   void SetupThreadSupportStructures();
-   void CreateMeshIndexSets();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void CreateSymmetryIndexSets(Index_t edgeNodes);
-   void SetupCommBuffers(Index_t edgeNodes);
-   void SetupElementConnectivities(Index_t edgeElems);
-   void SetupBoundaryConditions(Index_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* mesh-based index sets */
-   LULESH_ISET m_domNodeISet ;
-   LULESH_ISET m_domElemISet ;
-   LULESH_ISET m_domElemRegISet ;
-
-   LULESH_ISET m_domXSymNodeISet ;
-   LULESH_ISET m_domYSymNodeISet ;
-   LULESH_ISET m_domZSymNodeISet ;
-
-   /* region-based index sets */
-   std::vector<LULESH_ISET> m_domRegISet;
-
-   /* Node-centered */
-
-   struct Tuple3 {
-      Real_t x, y, z ;
-   } ;
-
-   std::vector<Tuple3> m_coord ;  /* coordinates */
-
-   std::vector<Tuple3> m_vel ; /* velocities */
-
-   std::vector<Tuple3> m_acc ; /* accelerations */
-
-   std::vector<Tuple3> m_force ;  /* forces */
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   // Element-centered
-
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   struct FaceElemConn {
-      Index_t lxim, lxip, letam, letap, lzetam, lzetap ;
-   } ;
-
-   std::vector<FaceElemConn> m_faceToElem ; /* element conn across faces */
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   struct Pcomponents {
-      Real_t p, q ;
-   } ;
-
-   std::vector<Pcomponents> m_pq ;   /* pressure and artificial viscosity */
-
-   struct Qcomponents {
-      Real_t ql, qq ;
-   } ;
-
-   std::vector<Qcomponents> m_qlqq ;  /* linear and quadratic terms for q */
-
-   struct Volume {
-      Real_t v, volo ;
-   } ;
-
-   std::vector<Volume> m_vol ;     /* relative and reference volume */
-
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_t *m_regElemSize ;   // Size of region sets
-   Index_t *m_regNumList ;    // Region number per domain element
-   Index_t **m_regElemlist ;  // region indexset 
-
-   // Permutation to pack element-centered material subsets
-   // into a contiguous range per material
-   Index_t *m_perm ;
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-#if defined(OMP_FINE_SYNC)
-   Index_t *m_nodeElemStart ;
-   Index_t *m_nodeElemCornerList ;
-#endif
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-RAJA_HOST_DEVICE
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/CMakeLists.txt b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/CMakeLists.txt
deleted file mode 100644
index 111d10431..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/CMakeLists.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-add_definitions(-DUSE_MPI=0 -DUSE_OMP=1)
-add_definitions(-DLULESH_HEADER=2)
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-if (RAJA_ENABLE_CUDA)
-  cuda_add_executable(lulesh2.0_RAJA-ISetMIC.exe
-    lulesh.cc
-    lulesh-comm.cc
-    lulesh-init.cc
-    lulesh-util.cc
-    lulesh-viz.cc)
-  target_link_libraries(lulesh2.0_RAJA-ISetMIC.exe RAJA ${RT_LIBRARIES})
-elseif(RAJA_ENABLE_OPENMP)
-  add_executable(lulesh2.0_RAJA-ISetMIC.exe
-    lulesh.cc
-    lulesh-comm.cc
-    lulesh-init.cc
-    lulesh-util.cc
-    lulesh-viz.cc)
-  target_link_libraries(lulesh2.0_RAJA-ISetMIC.exe RAJA ${RT_LIBRARIES})
-endif()
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/Makefile.keep b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/Makefile.keep
deleted file mode 100644
index 4e217bf95..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/Makefile.keep
+++ /dev/null
@@ -1,73 +0,0 @@
-#default build suggestion of MPI + OPENMP with gcc on Livermore machines you might have to change the compiler name
-
-SHELL = /bin/sh
-.SUFFIXES: .cc .o
-
-LULESH_EXEC = lulesh2.0
-
-MPI_INC = /opt/local/include/openmpi
-MPI_LIB = /opt/local/lib
-
-#Common defines
-MY_DEFS = -DUSE_MPI=0 -DUSE_OMP=1
-
-#Build with Intel compiler
-CXX  = /usr/local/tools/ic-14.0.144/bin/icpc 
-
-#Default build with GNU compiler
-#SERCXX = g++ -DUSE_MPI=0
-#MPICXX = mpig++ -DUSE_MPI=1
-#CXX = $(MPICXX)
-
-SOURCES2.0 = \
-	lulesh.cc \
-	lulesh-comm.cc \
-	lulesh-viz.cc \
-	lulesh-util.cc \
-	lulesh-init.cc
-OBJECTS2.0 = $(SOURCES2.0:.cc=.o)
-
-#Default build suggestions with OpenMP for GNU compiler
-#CXXFLAGS = -g -O3 -fopenmp -I. -Wall
-#LDFLAGS = -g -O3 -fopenmp
-
-#Build with OpenMP for Intel compiler
-#CXXFLAGS = -O3 -mavx -inline-max-total-size=20000 -inline-forceinline -ansi-alias -std=c++0x -openmp -static-intel $(MY_DEFS)
-CXXFLAGS = -O3 -msse4.1 -inline-max-total-size=20000 -inline-forceinline -ansi-alias -std=c++0x -openmp -static-intel $(MY_DEFS)
-LDFLAGS = -openmp
-
-#Below are reasonable default flags for a serial build
-#CXXFLAGS = -g -O3 -I. -Wall
-#LDFLAGS = -g -O3 
-
-#common places you might find silo on the Livermore machines.
-#SILO_INCDIR = /opt/local/include
-#SILO_LIBDIR = /opt/local/lib
-#SILO_INCDIR = ./silo/4.9/1.8.10.1/include
-#SILO_LIBDIR = ./silo/4.9/1.8.10.1/lib
-
-#If you do not have silo and visit you can get them at:
-#silo:  https://wci.llnl.gov/codes/silo/downloads.html
-#visit: https://wci.llnl.gov/codes/visit/download.html
-
-#below is and example of how to make with silo, hdf5 to get vizulization by default all this is turned off.  All paths are Livermore specific.
-#CXXFLAGS = -g -DVIZ_MESH -I${SILO_INCDIR} -Wall -Wno-pragmas
-#LDFLAGS = -g -L${SILO_LIBDIR} -Wl,-rpath -Wl,${SILO_LIBDIR} -lsiloh5 -lhdf5
-
-.cc.o: lulesh.h
-	@echo "Building $<"
-	$(CXX) -c $(CXXFLAGS) -o $@  $<
-
-all: $(LULESH_EXEC)
-
-lulesh2.0: $(OBJECTS2.0)
-	@echo "Linking"
-	$(CXX) $(OBJECTS2.0) $(LDFLAGS) -lm -o $@
-
-clean:
-	/bin/rm -f *.o *~ $(OBJECTS) $(LULESH_EXEC)
-	/bin/rm -rf *.dSYM
-
-tar: clean
-	cd .. ; tar cvf lulesh-2.0.tar LULESH-2.0 ; mv lulesh-2.0.tar LULESH-2.0
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/Makefile.ref b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/Makefile.ref
deleted file mode 100644
index 0513c8a8b..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/Makefile.ref
+++ /dev/null
@@ -1,107 +0,0 @@
-#  
-# THIS IS NOT OPEN SOURCE OR PUBLIC DOMAIN SOFTWARE
-#
-# See README-RAJA_license.txt for access and distribution restrictions
-#
-
-#
-#  Modify stuff in this Makefile to fit your machine and compiler.
-#
-#  Uncomment ONE AND ONLY ONE opt/debug option and "RAJA_ARCH" 
-#  variable line for the platform you want. 
-#
-# IMPORTANT:  The specific compiler version and options are set in
-#             the file ../compilers.mk.
-#
-
-OPT_DEBUG = opt
-#OPT_DEBUG = debug
-
-##
-## Set option to report basic runtime information.
-##
-##LULESH_TIMER_OPTS = -DRAJA_USE_CYCLE
-#LULESH_TIMER_OPTS = -DRAJA_USE_CLOCK
-LULESH_TIMER_OPTS = -DRAJA_USE_GETTIME
-
-# This is needed for RAJA_USE_GETTIME
-LDTIMER = -lrt
-
-
-#rzalastor
-RAJA_ARCH = x86_sse_icc
-#RAJA_ARCH = x86_sse_gnu
-
-#rzmerl
-#RAJA_ARCH = x86_avx_icc
-#RAJA_ARCH = x86_avx_gnu
-
-#rzuseq
-#RAJA_ARCH = bgq_xlc12
-#RAJA_ARCH = bgq_clang
-#RAJA_ARCH = bgq_gnu
-
-#rzmic
-#RAJA_ARCH = MIC
-
-RAJA_TOPDIR    = ../../..
-RAJA_INC       = $(RAJA_TOPDIR)/includes/
-RAJA_SRC       = $(RAJA_TOPDIR)/sources/
-
-
-##
-## The RAJA_rules.mk file defines macro variables that specify RAJA behavior.
-## To change the rules, the file in the RAJA include directory can be edited
-## or it can be replaced with custom version here.
-##
-include $(RAJA_TOPDIR)/build/RAJA_rules.mk
-
-SILO_INCDIR = /usr/gapps/silo/current/chaos_5_x86_64_ib/include
-SILO_LIBDIR = /usr/gapps/silo/current/chaos_5_x86_64_ib/lib
-
-
-##
-## Options to turn on/off MPI and OpenMP
-##
-LULESH_OPTS = -DUSE_MPI=0 -DUSE_OMP=1 -DVIZ_MESH $(LULESH_TIMER_OPTS)
-
-
-##
-## Include file containing compiler version and options.
-##
-include ../../compilers.mk
-
-
-#CXXFLAGS_BUILD = -DLULESH_LIST_INDEXSET=1 -I. -I../../includes -I$(RAJA_INC) $(CXXFLAGS) $(RAJA_RULES) $(LULESH_OPTS)
-CXXFLAGS_BUILD = -I. -I../../includes -I$(RAJA_INC) -I$(SILO_INCDIR) $(CXXFLAGS) $(RAJA_RULES) $(LULESH_OPTS)
-
-RAJAOBJS := $(patsubst %.cxx,%.o,$(wildcard $(RAJA_SRC)/*.cxx))
-
-LULESHOBJS := $(patsubst %.cc,%.o,$(wildcard ./*.cc))
-
-OBJFILES = $(LULESHOBJS) $(RAJAOBJS)
-
-default: $(OBJFILES)
-	$(CXX_COMPILE) -I${SILO_INCDIR} -DVIZ_MESH $(OBJFILES) $(LDFLAGS) $(LDTIMER) $(LDPATH) ${SILO_LIBDIR}/libsiloh5.so -o lulesh2.0_RAJA-ISet.exe
-
-## Uncomment these lines to generate an executable to run...
-%.o : %.cxx ; $(CXX_COMPILE) -c -o $@ $< $(CXXFLAGS_BUILD) 
-%.o : %.cc ; $(CXX_COMPILE) -c -o $@ $< $(CXXFLAGS_BUILD) 
-
-## The following can be used to generate vectorization report and 
-## assmbly code output for Intel compilers...
-#%.o : %.cxx ; $(CXX_COMPILE) -S -o $@ $< $(CXXFLAGS_BUILD)
-#%.o : %.cxx ; $(CXX_COMPILE) -S -vec-report3 -o $@ $< $(CXXFLAGS_BUILD)
-#%.o : %.cxx ; $(CXX_COMPILE) -S -fcode-asm -vec-report3 -o $@ $< $(CXXFLAGS_BUILD)
-
-## The following can be used to generate vectorization report and 
-## assmbly code output for IBM XLC compilers...
-## See compiler flag options in the appropriate section above.
-#%.o : %.cxx ; $(CXX_COMPILE) -c -o $@ $< $(CXXFLAGS_BUILD)
-
-
-clean-obj: 
-	rm -rf *.o $(RAJA_SRC)/*.o
-
-clean: clean-obj
-	rm -rf *.s *.lst *.exe 
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/README b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/README
deleted file mode 100644
index 8b0f260ba..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/README
+++ /dev/null
@@ -1,53 +0,0 @@
-This is the README for LULESH 2.0
-
-More information including LULESH 1.0 can be found at https://codesign.llnl.gov/lulesh.php
-
-If you have any questions or problems please contact:
-
-Ian Karlin <karlin1@llnl.gov>
-Jeff Keasler <keasler1@llnl.gov> or
-Rob Neely <neely4@llnl.gov>
-
-Also please send any notable results to Ian Karlin <karlin1@llnl.gov> as we are still evaluating the performance of this code.
-
-*** Notable changes in LULESH 2.0 ***
-
-Split functionality into different files
-lulesh.cc - where most (all?) of the timed functionality lies
-lulesh-comm.cc - MPI functionality
-lulesh-init.cc - Setup code
-lulesh-viz.cc  - Support for visualization option
-lulesh-util.cc - Non-timed functions
-
-The concept of "regions" was added, although every region is the same ideal gas material, and the same sedov blast wave problem is still the only problem its hardcoded to solve. Regions allow two things important to making this proxy app more representative:
-
-Four of the LULESH routines are now performed on a region-by-region basis, making the memory access patterns non-unit stride
-
-Artificial load imbalances can be easily introduced that could impact parallelization strategies.  
-   * The load balance flag changes region assignment.  Region number is raised to the power entered for assignment probability.  Most likely regions changes with MPI process id.
-   * The cost flag raises the cost of ~45% of the regions to evaluate EOS by the entered multiple.  The cost of 5% is 10x the entered
- multiple.
-
-MPI and OpenMP were added, and coalesced into a single version of the source that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
-
-Added support to write plot files using "poor mans parallel I/O" when linked with the silo library, which in turn can be read by VisIt.
-
-Enabled variable timestep calculation by default (courant condition), which results in an additional reduction.  Also, seeded the initial timestep based on analytical equation to allow scaling to arbitrary size.  Therefore steps to solution will differ from LULESH 1.0.
-
-Default domain (mesh) size reduced from 45^3 to 30^3
-
-Command line options to allow for numerous test cases without needing to recompile
-
-Performance optimizations and code cleanup uncovered during study of LULESH 1.0
-
-Added a "Figure of Merit" calculation (elements solved per microsecond) and output in support of using LULESH 2.0 for the 2017 CORAL procurement
-
-Possible Future 2.0 minor updates (other changes possible as discovered)
-
-* Different default parameters
-* Minor code performance changes and cleanupS
-
-TODO in future versions
-* Add reader for (truly) unstructured meshes, probably serial only
-* CMake based build system
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-comm.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-comm.cc
deleted file mode 100644
index a30c3ec1c..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-comm.cc
+++ /dev/null
@@ -1,1837 +0,0 @@
-#include "lulesh.h"
-
-// If no MPI, then this whole file is stubbed out
-#if USE_MPI
-
-#include <mpi.h>
-#include <string.h>
-
-/* Comm Routines */
-
-#define ALLOW_UNPACKED_PLANE false
-#define ALLOW_UNPACKED_ROW   false
-#define ALLOW_UNPACKED_COL   false
-
-/*
-   There are coherence issues for packing and unpacking message
-   buffers.  Ideally, you would like a lot of threads to 
-   cooperate in the assembly/dissassembly of each message.
-   To do that, each thread should really be operating in a
-   different coherence zone.
-
-   Let's assume we have three fields, f1 through f3, defined on
-   a 61x61x61 cube.  If we want to send the block boundary
-   information for each field to each neighbor processor across
-   each cube face, then we have three cases for the
-   memory layout/coherence of data on each of the six cube
-   boundaries:
-
-      (a) Two of the faces will be in contiguous memory blocks
-      (b) Two of the faces will be comprised of pencils of
-          contiguous memory.
-      (c) Two of the faces will have large strides between
-          every value living on the face.
-
-   How do you pack and unpack this data in buffers to
-   simultaneous achieve the best memory efficiency and
-   the most thread independence?
-
-   Do do you pack field f1 through f3 tighly to reduce message
-   size?  Do you align each field on a cache coherence boundary
-   within the message so that threads can pack and unpack each
-   field independently?  For case (b), do you align each
-   boundary pencil of each field separately?  This increases
-   the message size, but could improve cache coherence so
-   each pencil could be processed independently by a separate
-   thread with no conflicts.
-
-   Also, memory access for case (c) would best be done without
-   going through the cache (the stride is so large it just causes
-   a lot of useless cache evictions).  Is it worth creating
-   a special case version of the packing algorithm that uses
-   non-coherent load/store opcodes?
-*/
-
-/******************************************/
-
-
-/* doRecv flag only works with regular block structure */
-void CommRecv(Domain& domain, int msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz, bool doRecv, bool planeOnly) {
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   /* post recieve buffers for all incoming messages */
-   int myRank ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   for (Index_t i=0; i<26; ++i) {
-      domain.recvRequest[i] = MPI_REQUEST_NULL ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   /* post receives */
-
-   /* receive data from neighboring domain faces */
-   if (planeMin && doRecv) {
-      /* contiguous memory */
-      int fromRank = myRank - domain.tp()*domain.tp() ;
-      int recvCount = dx * dy * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (planeMax) {
-      /* contiguous memory */
-      int fromRank = myRank + domain.tp()*domain.tp() ;
-      int recvCount = dx * dy * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (rowMin && doRecv) {
-      /* semi-contiguous memory */
-      int fromRank = myRank - domain.tp() ;
-      int recvCount = dx * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (rowMax) {
-      /* semi-contiguous memory */
-      int fromRank = myRank + domain.tp() ;
-      int recvCount = dx * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (colMin && doRecv) {
-      /* scattered memory */
-      int fromRank = myRank - 1 ;
-      int recvCount = dy * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (colMax) {
-      /* scattered memory */
-      int fromRank = myRank + 1 ;
-      int recvCount = dy * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-
-   if (!planeOnly) {
-      /* receive data from domains connected only by an edge */
-      if (rowMin && colMin && doRecv) {
-         int fromRank = myRank - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMax) {
-         int fromRank = myRank + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMin) {
-         int fromRank = myRank + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMax && doRecv) {
-         int fromRank = myRank - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      /* receive data from domains connected only by a corner */
-      if (rowMin && colMin && planeMin && doRecv) {
-         /* corner at domain logical coord (0, 0, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMin && planeMax) {
-         /* corner at domain logical coord (0, 0, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMin && doRecv) {
-         /* corner at domain logical coord (1, 0, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMax) {
-         /* corner at domain logical coord (1, 0, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMin && doRecv) {
-         /* corner at domain logical coord (0, 1, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMax) {
-         /* corner at domain logical coord (0, 1, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMin && doRecv) {
-         /* corner at domain logical coord (1, 1, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMax) {
-         /* corner at domain logical coord (1, 1, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-   }
-}
-
-/******************************************/
-
-void CommSend(Domain& domain, int msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly)
-{
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   /* post recieve buffers for all incoming messages */
-   int myRank ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
-   MPI_Status status[26] ;
-   Real_t *destAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   for (Index_t i=0; i<26; ++i) {
-      domain.sendRequest[i] = MPI_REQUEST_NULL ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   /* post sends */
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dx * dy ;
-
-      if (planeMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<sendCount; ++i) {
-               destAddr[i] = (domain.*src)(i) ;
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - domain.tp()*domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (planeMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<sendCount; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz - 1) + i) ;
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + domain.tp()*domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dx * dz ;
-
-      if (rowMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  destAddr[i*dx+j] = (domain.*src)(i*dx*dy + j) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (rowMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  destAddr[i*dx+j] = (domain.*src)(dx*(dy - 1) + i*dx*dy + j) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dy * dz ;
-
-      if (colMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  destAddr[i*dy + j] = (domain.*src)(i*dx*dy + j*dx) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - 1, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (colMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  destAddr[i*dy + j] = (domain.*src)(dx - 1 + i*dx*dy + j*dx) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + 1, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-
-   if (!planeOnly) {
-      if (rowMin && colMin) {
-         int toRank = myRank - domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMax && doSend) {
-         int toRank = myRank + domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy - 1 + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-              destAddr[i] = (domain.*src)(dx*(dy-1) + dx*dy*(dz-1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + dx - 1 + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMin && doSend) {
-         int toRank = myRank + domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx*(dy-1) + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMax) {
-         int toRank = myRank - domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx - 1 + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(dx*(dy - 1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx - 1 + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMin && planeMin) {
-         /* corner at domain logical coord (0, 0, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(0) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMin && planeMax && doSend) {
-         /* corner at domain logical coord (0, 0, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMin) {
-         /* corner at domain logical coord (1, 0, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMax && doSend) {
-         /* corner at domain logical coord (1, 0, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMin) {
-         /* corner at domain logical coord (0, 1, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*(dy - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMax && doSend) {
-         /* corner at domain logical coord (0, 1, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMin) {
-         /* corner at domain logical coord (1, 1, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMax && doSend) {
-         /* corner at domain logical coord (1, 1, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*dz - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-   }
-
-   MPI_Waitall(26, domain.sendRequest, status) ;
-}
-
-/******************************************/
-
-void CommSBN(Domain& domain, int xferFields, Domain_member *fieldData) {
-   /* Adds (+=) values received in domain.commDataRecv into each of the xferFields fields named by fieldData. */
-   if (domain.numRanks() == 1)
-      return ; /* single rank: return before touching any buffer or request */
-   /* Every message is unpacked only after MPI_Wait on its domain.recvRequest entry; order is faces, edges, corners. */
-   /* summation order should be from smallest value to largest */
-   /* or we could try out kahan summation! */
-   /* NOTE(review): presumably the matching receives are posted by another routine before this call -- confirm. */
-   int myRank ; /* set by MPI_Comm_rank below; not read again in this function */
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ; /* buffer stride per face message */
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ; /* buffer stride per edge message */
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   Index_t dx = domain.sizeX() + 1 ; /* presumably nodes per axis (elements + 1) -- confirm */
-   Index_t dy = domain.sizeY() + 1 ;
-   Index_t dz = domain.sizeZ() + 1 ;
-   MPI_Status status ;
-   Real_t *srcAddr ; /* read cursor into domain.commDataRecv */
-   Index_t rowMin, rowMax, colMin, colMax, planeMin, planeMax ; /* 1 = unpack that side's message, 0 = skip */
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = 1 ;
-   if (domain.rowLoc() == 0) { /* a location of 0 or tp()-1 clears the matching flag */
-      rowMin = 0 ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = 0 ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = 0 ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = 0 ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = 0 ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = 0 ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-   /* Face messages: buffer offset pmsg*maxPlaneComm, request index pmsg. */
-   if (planeMin | planeMax) { /* bitwise | works here because the flags are only ever 0 or 1 */
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ; /* values per field in a plane-face message */
-
-      if (planeMin) {
-         /* contiguous memory: field indices 0 .. dx*dy-1 */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ; /* wait for this face's receive to complete */
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(i) += srcAddr[i] ;
-            }
-            srcAddr += opCount ; /* fields are packed back to back in the buffer */
-         }
-         ++pmsg ; /* next face slot and request index */
-      }
-      if (planeMax) {
-         /* contiguous memory: field indices start at dx*dy*(dz - 1) */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(dx*dy*(dz - 1) + i) += srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ; /* values per field in a row-face message */
-
-      if (rowMin) {
-         /* buffer is contiguous (i*dx + j); field index steps by dx*dy per i */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(i*dx*dy + j) += srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* same buffer layout; field indices are offset by dx*(dy - 1) */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) += srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ; /* values per field in a column-face message */
-
-      if (colMin) {
-         /* buffer is contiguous (i*dy + j); field index is i*dx*dy + j*dx */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* same buffer layout; field indices are offset by dx - 1 */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(dx - 1 + i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-   /* Edge messages: buffer offset pmsg*maxPlaneComm + emsg*maxEdgeComm, request index pmsg+emsg. */
-   if (rowMin & colMin) { /* dz values per field, stride dx*dy from index 0 */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & planeMin) { /* dx values per field, indices 0 .. dx-1 */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin & planeMin) { /* dy values per field, stride dx from index 0 */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & colMax) { /* dz values per field, stride dx*dy from index dx*dy - 1 */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*dy - 1 + i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & planeMax) { /* dx values per field, from index dx*(dy-1) + dx*dy*(dz-1) */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax & planeMax) { /* dy values per field, stride dx from index dx*dy*(dz-1) + dx - 1 */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & colMin) { /* dz values per field, stride dx*dy from index dx*(dy-1) */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*(dy-1) + i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & planeMax) { /* dx values per field, from index dx*dy*(dz-1) */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin & planeMax) { /* dy values per field, stride dx from index dx*dy*(dz-1) */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & colMax) { /* dz values per field, stride dx*dy from index dx - 1 */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx - 1 + i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & planeMin) { /* dx values per field, from index dx*(dy - 1) */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy - 1) + i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax & planeMin) { /* dy values per field, stride dx from index dx - 1 */
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx - 1 + i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-   /* Corner messages: one value per field; slots are CACHE_COHERENCE_PAD_REAL apart; request index pmsg+emsg+cmsg. */
-   if (rowMin & colMin & planeMin) {
-      /* corner at domain logical coord (0, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(0) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMin & planeMax) {
-      /* corner at domain logical coord (0, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMax & planeMin) {
-      /* corner at domain logical coord (1, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMax & planeMax) {
-      /* corner at domain logical coord (1, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMin & planeMin) {
-      /* corner at domain logical coord (0, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMin & planeMax) {
-      /* corner at domain logical coord (0, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMax & planeMin) {
-      /* corner at domain logical coord (1, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMax & planeMax) {
-      /* corner at domain logical coord (1, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*dz - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-}
-
-/******************************************/
-
-void CommSyncPosVel(Domain& domain) {
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   int myRank ;
-   bool doRecv = false ;
-   Index_t xferFields = 6 ; /* x, y, z, xd, yd, zd */
-   Domain_member fieldData[6] ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   Index_t dx = domain.sizeX() + 1 ;
-   Index_t dy = domain.sizeY() + 1 ;
-   Index_t dz = domain.sizeZ() + 1 ;
-   MPI_Status status ;
-   Real_t *srcAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   fieldData[0] = &Domain::x ;
-   fieldData[1] = &Domain::y ;
-   fieldData[2] = &Domain::z ;
-   fieldData[3] = &Domain::xd ;
-   fieldData[4] = &Domain::yd ;
-   fieldData[5] = &Domain::zd ;
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ;
-
-      if (planeMin && doRecv) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (planeMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(dx*dy*(dz - 1) + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ;
-
-      if (rowMin && doRecv) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(i*dx*dy + j) = srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) = srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ;
-
-      if (colMin && doRecv) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(dx - 1 + i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin && colMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && colMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*dy - 1 + i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && colMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*(dy-1) + i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && colMax && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx - 1 + i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy - 1) + i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx - 1 + i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-
-   if (rowMin && colMin && planeMin && doRecv) {
-      /* corner at domain logical coord (0, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(0) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMin && planeMax) {
-      /* corner at domain logical coord (0, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMax && planeMin && doRecv) {
-      /* corner at domain logical coord (1, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMax && planeMax) {
-      /* corner at domain logical coord (1, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMin && planeMin && doRecv) {
-      /* corner at domain logical coord (0, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMin && planeMax) {
-      /* corner at domain logical coord (0, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMax && planeMin && doRecv) {
-      /* corner at domain logical coord (1, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMax && planeMax) {
-      /* corner at domain logical coord (1, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*dz - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-}
-
-/******************************************/
-
-void CommMonoQ(Domain& domain)
-{
-   if (domain.numRanks() == 1)
-      return ;
-
-   int myRank ;
-   Index_t xferFields = 3 ; /* delv_xi, delv_eta, delv_zeta */
-   Domain_member fieldData[3] ;
-   Index_t fieldOffset[3] ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t dx = domain.sizeX() ;
-   Index_t dy = domain.sizeY() ;
-   Index_t dz = domain.sizeZ() ;
-   MPI_Status status ;
-   Real_t *srcAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   /* point into ghost data area */
-   // fieldData[0] = &(domain.delv_xi(domain.numElem())) ;
-   // fieldData[1] = &(domain.delv_eta(domain.numElem())) ;
-   // fieldData[2] = &(domain.delv_zeta(domain.numElem())) ;
-   fieldData[0] = &Domain::delv_xi ;
-   fieldData[1] = &Domain::delv_eta ;
-   fieldData[2] = &Domain::delv_zeta ;
-   fieldOffset[0] = domain.numElem() ;
-   fieldOffset[1] = domain.numElem() ;
-   fieldOffset[2] = domain.numElem() ;
-
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ;
-
-      if (planeMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (planeMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ;
-
-      if (rowMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ;
-
-      if (colMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-}
-
-#endif
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-init.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-init.cc
deleted file mode 100644
index 37bb622b7..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-init.cc
+++ /dev/null
@@ -1,874 +0,0 @@
-#include <math.h>
-#if USE_MPI
-# include <mpi.h>
-#endif
-#if USE_OMP
-#include <omp.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <cstdlib>
-#include "lulesh.h"
-
-/////////////////////////////////////////////////////////////////////
-Domain::Domain(Int_t numRanks, Index_t colLoc,
-               Index_t rowLoc, Index_t planeLoc,
-               Index_t nx, int tp, int nr, int balance, Int_t cost)
-   :
-   m_e_cut(Real_t(1.0e-7)),
-   m_p_cut(Real_t(1.0e-7)),
-   m_q_cut(Real_t(1.0e-7)),
-   m_v_cut(Real_t(1.0e-10)),
-   m_u_cut(Real_t(1.0e-7)),
-   m_hgcoef(Real_t(3.0)),
-   m_ss4o3(Real_t(4.0)/Real_t(3.0)),
-   m_qstop(Real_t(1.0e+12)),
-   m_monoq_max_slope(Real_t(1.0)),
-   m_monoq_limiter_mult(Real_t(2.0)),
-   m_qlc_monoq(Real_t(0.5)),
-   m_qqc_monoq(Real_t(2.0)/Real_t(3.0)),
-   m_qqc(Real_t(2.0)),
-   m_eosvmax(Real_t(1.0e+9)),
-   m_eosvmin(Real_t(1.0e-9)),
-   m_pmin(Real_t(0.)),
-   m_emin(Real_t(-1.0e+15)),
-   m_dvovmax(Real_t(0.1)),
-   m_refdens(Real_t(1.0)),
-//
-// set pointers to (potentially) "new'd" arrays to null to 
-// simplify deallocation.
-//
-   m_perm(0),
-   m_regNumList(0),
-   m_nodeElemStart(0),
-   m_nodeElemCornerList(0),
-   m_regElemSize(0),
-   m_regElemlist(0)
-#if USE_MPI
-   , 
-   commDataSend(0),
-   commDataRecv(0)
-#endif
-{
-
-   Index_t edgeElems = nx ;
-   Index_t edgeNodes = edgeElems+1 ;
-   this->cost() = cost;
-
-   m_tp       = tp ;
-   m_numRanks = numRanks ;
-
-   ///////////////////////////////
-   //   Initialize Sedov Mesh
-   ///////////////////////////////
-
-   // construct a uniform box for this processor
-
-   m_colLoc   =   colLoc ;
-   m_rowLoc   =   rowLoc ;
-   m_planeLoc = planeLoc ;
-   
-   m_sizeX = edgeElems ;
-   m_sizeY = edgeElems ;
-   m_sizeZ = edgeElems ;
-   m_numElem = edgeElems*edgeElems*edgeElems ;
-
-   m_numNode = edgeNodes*edgeNodes*edgeNodes ;
-
-   m_regNumList = new Index_t[numElem()] ;  // material indexset
-
-#if !defined(LULESH_LIST_INDEXSET)
-   m_perm = new Index_t[numElem()] ;
-#endif
-   // Elem-centered 
-   AllocateElemPersistent(numElem()) ;
-
-   // Node-centered 
-   AllocateNodePersistent(numNode()) ;
-
-   SetupCommBuffers(edgeNodes);
-
-   BuildMeshTopology(edgeNodes, edgeElems);
-
-   BuildMeshCoordinates(nx, edgeNodes);
-
-   // Setup index sets for nodes and elems 
-   CreateMeshIndexSets();
-
-   // Setup symmetry nodesets
-   CreateSymmetryIndexSets(edgeNodes);
-
-   // Setup element connectivities
-   SetupElementConnectivities(edgeElems);
-
-   // Setup symmetry planes and free surface boundary arrays
-   SetupBoundaryConditions(edgeElems);
-
-   // Setup region index sets. For now, these are constant sized
-   // throughout the run, but could be changed every cycle to 
-   // simulate effects of ALE on the lagrange solver
-   CreateRegionIndexSets(nr, balance);
-
-   /* find element zero index */
-   Index_t initEnergyElemIdx = 0 ;
-
-   /* assign each material to a contiguous range of elements */
-   if ((m_perm != 0) && (nr != 1)) {
-      /* permute nodelist connectivity */
-      {
-         Index_t *tmp = new Index_t[8*numElem()] ;
-         RAJA::forall<elem_exec_policy>(getElemISet(), [&] (int i) {
-            Index_t *localNode = &nodelist[8*m_perm[i]] ;
-            for (Index_t j=0; j<8; ++j) {
-               tmp[i*8+j] = localNode[j] ;
-            }
-         } ) ;
-         memcpy(nodelist, tmp, 8*sizeof(Index_t)*numElem()) ;
-         delete [] tmp ;
-      }
-
-      /* permute lxim, lxip, letam, letap, lzetam, lzetap */
-      {
-         Index_t *tmp = new Index_t[6*numElem()] ;
-         Index_t *iperm = new Index_t[numElem()] ; /* inverse permutation */
-
-         RAJA::forall<elem_exec_policy>(getElemISet(), [&] (int i) {
-            iperm[m_perm[i]] = i ;
-         } ) ;
-         RAJA::forall<elem_exec_policy>(getElemISet(), [&] (int i) {
-            tmp[i*6+0] = iperm[lxim[m_perm[i]]] ;
-            tmp[i*6+1] = iperm[lxip[m_perm[i]]] ;
-            tmp[i*6+2] = iperm[letam[m_perm[i]]] ;
-            tmp[i*6+3] = iperm[letap[m_perm[i]]] ;
-            tmp[i*6+4] = iperm[lzetam[m_perm[i]]] ;
-            tmp[i*6+5] = iperm[lzetap[m_perm[i]]] ;
-         } ) ;
-         RAJA::forall<elem_exec_policy>(getElemISet(), [&] (int i) {
-            lxim[i] = tmp[i*6+0] ;
-            lxip[i] = tmp[i*6+1] ;
-            letam[i] = tmp[i*6+2] ;
-            letap[i] = tmp[i*6+3] ;
-            lzetam[i] = tmp[i*6+4] ;
-            lzetap[i] = tmp[i*6+5] ;
-         } ) ;
-
-         initEnergyElemIdx = iperm[0] ;
-
-         delete [] iperm ;
-         delete [] tmp ;
-      }
-      /* permute elemBC */
-      {
-         Int_t *tmp = new Int_t[numElem()] ;
-         RAJA::forall<elem_exec_policy>(getElemISet(), [&] (int i) {
-            tmp[i] = elemBC[m_perm[i]] ;
-         } ) ;
-         RAJA::forall<elem_exec_policy>(getElemISet(), [&] (int i) {
-            elemBC[i] = tmp[i] ;
-         } ) ;
-         delete [] tmp ;
-      }
-   }
-
-   // Basic Field Initialization 
-   RAJA::forall<elem_exec_policy>(getElemISet(), [&] (int i) {
-      e[i] =  Real_t(0.0) ;
-      p[i] =  Real_t(0.0) ;
-      q[i] =  Real_t(0.0) ;
-      ss[i] = Real_t(0.0) ;
-   } ) ;
-
-   // Note - v initializes to 1.0, not 0.0!
-   RAJA::forall<elem_exec_policy>(getElemISet(), [&] (int i) {
-      v[i] = Real_t(1.0) ;
-   } ) ;
-
-   RAJA::forall<node_exec_policy>(getNodeISet(), [&] (int i) {
-      xd[i] = Real_t(0.0) ;
-      yd[i] = Real_t(0.0) ;
-      zd[i] = Real_t(0.0) ;
-   } ) ;
-
-   RAJA::forall<node_exec_policy>(getNodeISet(), [&] (int i) {
-      xdd[i] = Real_t(0.0) ;
-      ydd[i] = Real_t(0.0) ;
-      zdd[i] = Real_t(0.0) ;
-   } ) ;
-
-   RAJA::forall<node_exec_policy>(getNodeISet(), [&] (int i) {
-      nodalMass[i] = Real_t(0.0) ;
-   } ) ;
-
-#if USE_OMP
-   SetupThreadSupportStructures();
-#endif
-
-
-   // Setup defaults
-
-   // These can be changed (requires recompile) if you want to run
-   // with a fixed timestep, or to a different end time, but it's
-   // probably easier/better to just run a fixed number of timesteps
-   // using the -i flag in 2.x
-
-   dtfixed() = Real_t(-1.0e-6) ; // Negative means use courant condition
-   stoptime()  = Real_t(1.0e-2); // *Real_t(edgeElems*tp/45.0) ;
-
-   // Initial conditions
-   deltatimemultlb() = Real_t(1.1) ;
-   deltatimemultub() = Real_t(1.2) ;
-   dtcourant() = Real_t(1.0e+20) ;
-   dthydro()   = Real_t(1.0e+20) ;
-   dtmax()     = Real_t(1.0e-2) ;
-   time()    = Real_t(0.) ;
-   cycle()   = Int_t(0) ;
-
-   // initialize field data 
-   RAJA::forall<elem_exec_policy>(getElemISet(), [&] (int i) {
-      Real_t x_local[8], y_local[8], z_local[8] ;
-      Index_t *elemToNode = &nodelist[8*i] ;
-      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-      {
-        Index_t gnode = elemToNode[lnode];
-        x_local[lnode] = x[gnode];
-        y_local[lnode] = y[gnode];
-        z_local[lnode] = z[gnode];
-      }
-
-      // volume calculations
-      Real_t volume = CalcElemVolume(x_local, y_local, z_local );
-      volo[i] = volume ;
-      elemMass[i] = volume ;
-   } ) ;
-
-   /* RAJA is not thread-safe here -- address when more policies defined */
-   // RAJA::forall<elem_exec_policy>(getElemISet(), [&] (int i) {
-   for (Index_t i=0; i<numElem(); ++i) {
-      Index_t *elemToNode = &nodelist[8*i] ;
-      Real_t cornerMass = elemMass[i] / Real_t(8.0) ;
-      for (Index_t j=0; j<8; ++j) {
-         Index_t idx = elemToNode[j] ;
-         nodalMass[idx] += cornerMass ;
-      }
-   } // ) ;
-
-   // deposit initial energy
-   // An energy of 3.948746e+7 is correct for a problem with
-   // 45 zones along a side - we need to scale it
-   const Real_t ebase = Real_t(3.948746e+7);
-   Real_t scale = (nx*m_tp)/Real_t(45.0);
-   Real_t einit = ebase*scale*scale*scale;
-   if (m_rowLoc + m_colLoc + m_planeLoc == 0) {
-      // Dump into the first zone (which we know is in the corner)
-      // of the domain that sits at the origin
-      e[initEnergyElemIdx] = einit;
-   }
-   //set initial deltatime base on analytic CFL calculation
-   deltatime() = (Real_t(.5)*cbrt(volo[0]))/sqrt(Real_t(2.0)*einit);
-
-} // End constructor
-
-
-////////////////////////////////////////////////////////////////////////////////
-Domain::~Domain()
-{
-   delete [] m_regNumList;
-   delete [] m_nodeElemStart;
-   delete [] m_nodeElemCornerList;
-   delete [] m_regElemSize;
-   if (numReg() != 1) {
-      for (Index_t i=0 ; i<numReg() ; ++i) {
-        delete [] m_regElemlist[i];
-      }
-   }
-   delete [] m_regElemlist;
-   
-   if (m_perm != 0) {
-      delete [] m_perm ;
-   }
-#if USE_MPI
-   delete [] commDataSend;
-   delete [] commDataRecv;
-#endif
-} // End destructor
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::BuildMeshTopology(Index_t edgeNodes, Index_t edgeElems)
-{
-  // embed hexehedral elements in nodal point lattice 
-  Index_t zidx = 0 ;
-  Index_t nidx = 0 ;
-  for (Index_t plane=0; plane<edgeElems; ++plane) {
-    for (Index_t row=0; row<edgeElems; ++row) {
-      for (Index_t col=0; col<edgeElems; ++col) {
-        Index_t *localNode = &nodelist[8*zidx] ;
-        localNode[0] = nidx                                       ;
-        localNode[1] = nidx                                   + 1 ;
-        localNode[2] = nidx                       + edgeNodes + 1 ;
-        localNode[3] = nidx                       + edgeNodes     ;
-        localNode[4] = nidx + edgeNodes*edgeNodes                 ;
-        localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
-        localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
-        localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
-        ++zidx ;
-        ++nidx ;
-      }
-      ++nidx ;
-    }
-    nidx += edgeNodes ;
-  }
-}
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::BuildMeshCoordinates(Index_t nx, Index_t edgeNodes)
-{
-  Index_t meshEdgeElems = m_tp*nx ;
-
-  // initialize nodal coordinates 
-  Index_t nidx = 0 ;
-  Real_t tz = Real_t(1.125)*Real_t(m_planeLoc*nx)/Real_t(meshEdgeElems) ;
-  for (Index_t plane=0; plane<edgeNodes; ++plane) {
-    Real_t ty = Real_t(1.125)*Real_t(m_rowLoc*nx)/Real_t(meshEdgeElems) ;
-    for (Index_t row=0; row<edgeNodes; ++row) {
-      Real_t tx = Real_t(1.125)*Real_t(m_colLoc*nx)/Real_t(meshEdgeElems) ;
-      for (Index_t col=0; col<edgeNodes; ++col) {
-        x[nidx] = tx ;
-        y[nidx] = ty ;
-        z[nidx] = tz ;
-        ++nidx ;
-        // tx += ds ; // may accumulate roundoff... 
-        tx = Real_t(1.125)*Real_t(m_colLoc*nx+col+1)/Real_t(meshEdgeElems) ;
-      }
-      // ty += ds ;  // may accumulate roundoff... 
-      ty = Real_t(1.125)*Real_t(m_rowLoc*nx+row+1)/Real_t(meshEdgeElems) ;
-    }
-    // tz += ds ;  // may accumulate roundoff... 
-    tz = Real_t(1.125)*Real_t(m_planeLoc*nx+plane+1)/Real_t(meshEdgeElems) ;
-  }
-
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::SetupThreadSupportStructures()
-{
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-
-  if (numthreads > 1) {
-    // set up node-centered indexing of elements 
-    Index_t *nodeElemCount = new Index_t[numNode()] ;
-
-    for (Index_t i=0; i<numNode(); ++i) {
-      nodeElemCount[i] = 0 ;
-    }
-
-    for (Index_t i=0; i<numElem(); ++i) {
-      Index_t *nl = &nodelist[8*i] ;
-      for (Index_t j=0; j < 8; ++j) {
-        ++(nodeElemCount[nl[j]] );
-      }
-    }
-
-    m_nodeElemStart = new Index_t[numNode()+1] ;
-
-    m_nodeElemStart[0] = 0;
-
-    for (Index_t i=1; i <= numNode(); ++i) {
-      m_nodeElemStart[i] =
-        m_nodeElemStart[i-1] + nodeElemCount[i-1] ;
-    }
-       
-    m_nodeElemCornerList = new Index_t[m_nodeElemStart[numNode()]];
-
-    for (Index_t i=0; i < numNode(); ++i) {
-      nodeElemCount[i] = 0;
-    }
-
-    for (Index_t i=0; i < numElem(); ++i) {
-      Index_t *nl = &nodelist[8*i] ;
-      for (Index_t j=0; j < 8; ++j) {
-        Index_t m = nl[j];
-        Index_t k = i*8 + j ;
-        Index_t offset = m_nodeElemStart[m] + nodeElemCount[m] ;
-        m_nodeElemCornerList[offset] = k;
-        ++(nodeElemCount[m]) ;
-      }
-    }
-
-    Index_t clSize = m_nodeElemStart[numNode()] ;
-    for (Index_t i=0; i < clSize; ++i) {
-      Index_t clv = m_nodeElemCornerList[i] ;
-      if ((clv < 0) || (clv > numElem()*8)) {
-        fprintf(stderr,
-                "AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n");
-#if USE_MPI
-        MPI_Abort(MPI_COMM_WORLD, -1);
-#else
-        exit(-1);
-#endif
-      }
-    }
-
-    delete [] nodeElemCount ;
-  }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::SetupCommBuffers(Index_t edgeNodes)
-{
-  // allocate a buffer large enough for nodal ghost data 
-  Index_t maxEdgeSize = MAX(this->sizeX(), MAX(this->sizeY(), this->sizeZ()))+1 ;
-  m_maxPlaneSize = CACHE_ALIGN_REAL(maxEdgeSize*maxEdgeSize) ;
-  m_maxEdgeSize = CACHE_ALIGN_REAL(maxEdgeSize) ;
-
-  // assume communication to 6 neighbors by default 
-  m_rowMin = (m_rowLoc == 0)        ? 0 : 1;
-  m_rowMax = (m_rowLoc == m_tp-1)     ? 0 : 1;
-  m_colMin = (m_colLoc == 0)        ? 0 : 1;
-  m_colMax = (m_colLoc == m_tp-1)     ? 0 : 1;
-  m_planeMin = (m_planeLoc == 0)    ? 0 : 1;
-  m_planeMax = (m_planeLoc == m_tp-1) ? 0 : 1;
-
-#if USE_MPI   
-  // account for face communication 
-  Index_t comBufSize =
-    (m_rowMin + m_rowMax + m_colMin + m_colMax + m_planeMin + m_planeMax) *
-    m_maxPlaneSize * MAX_FIELDS_PER_MPI_COMM ;
-
-  // account for edge communication 
-  comBufSize +=
-    ((m_rowMin & m_colMin) + (m_rowMin & m_planeMin) + (m_colMin & m_planeMin) +
-     (m_rowMax & m_colMax) + (m_rowMax & m_planeMax) + (m_colMax & m_planeMax) +
-     (m_rowMax & m_colMin) + (m_rowMin & m_planeMax) + (m_colMin & m_planeMax) +
-     (m_rowMin & m_colMax) + (m_rowMax & m_planeMin) + (m_colMax & m_planeMin)) *
-    m_maxEdgeSize * MAX_FIELDS_PER_MPI_COMM ;
-
-  // account for corner communication 
-  // factor of 16 is so each buffer has its own cache line 
-  comBufSize += ((m_rowMin & m_colMin & m_planeMin) +
-                 (m_rowMin & m_colMin & m_planeMax) +
-                 (m_rowMin & m_colMax & m_planeMin) +
-                 (m_rowMin & m_colMax & m_planeMax) +
-                 (m_rowMax & m_colMin & m_planeMin) +
-                 (m_rowMax & m_colMin & m_planeMax) +
-                 (m_rowMax & m_colMax & m_planeMin) +
-                 (m_rowMax & m_colMax & m_planeMax)) * CACHE_COHERENCE_PAD_REAL ;
-
-  this->commDataSend = new Real_t[comBufSize] ;
-  this->commDataRecv = new Real_t[comBufSize] ;
-  // prevent floating point exceptions 
-  memset(this->commDataSend, 0, comBufSize*sizeof(Real_t)) ;
-  memset(this->commDataRecv, 0, comBufSize*sizeof(Real_t)) ;
-#endif   
-
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::CreateMeshIndexSets()
-{
-   // leave nodes and elems in canonical ordering for now...
-   m_domNodeISet.push_back( RAJA::RangeSegment(0, numNode()) );   
-   m_domElemISet.push_back( RAJA::RangeSegment(0, numElem()) );
-}
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::CreateRegionIndexSets(Int_t nr, Int_t balance)
-{
-#if USE_MPI   
-   Index_t myRank;
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-   srand(myRank);
-#else
-   srand(0);
-   Index_t myRank = 0;
-#endif
-   this->numReg() = nr;
-   m_regElemSize = new Index_t[numReg()];
-   m_regElemlist = new Index_t*[numReg()];
-   Index_t nextIndex = 0;
-   //if we only have one region just fill it
-   // Fill out the regNumList with material numbers, which are always
-   // the region index plus one 
-   if(numReg() == 1) {
-      while (nextIndex < numElem()) {
-         this->regNumList(nextIndex) = 1;
-         nextIndex++;
-      }
-      regElemSize(0) = numElem();
-      m_domRegISet.resize(numReg());
-      m_domRegISet[0].push_back( RAJA::RangeSegment(0, regElemSize(0)) ) ;
-#if !defined(LULESH_LIST_INDEXSET)
-      for (int i=0; i<numElem(); ++i) {
-         m_perm[i] = i ;
-      }
-#endif
-   }
-   //If we have more than one region distribute the elements.
-   else {
-      Int_t regionNum;
-      Int_t regionVar;
-      Int_t lastReg = -1;
-      Int_t binSize;
-      Index_t elements;
-      Index_t runto = 0;
-      Int_t costDenominator = 0;
-      Int_t* regBinEnd = new Int_t[numReg()];
-      //Determine the relative weights of all the regions.  This is based off the -b flag.  Balance is the value passed into b.  
-      for (Index_t i=0 ; i<numReg() ; ++i) {
-         regElemSize(i) = 0;
-         costDenominator += pow((i+1), balance);  //Total sum of all regions weights
-         regBinEnd[i] = costDenominator;  //Chance of hitting a given region is (regBinEnd[i] - regBinEdn[i-1])/costDenominator
-      }
-      //Until all elements are assigned
-      while (nextIndex < numElem()) {
-         //pick the region
-         regionVar = rand() % costDenominator;
-         Index_t i = 0;
-         while(regionVar >= regBinEnd[i])
-            i++;
-         //rotate the regions based on MPI rank.  Rotation is Rank % NumRegions this makes each domain have a different region with 
-         //the highest representation
-         regionNum = ((i + myRank) % numReg()) + 1;
-         // make sure we don't pick the same region twice in a row
-         while(regionNum == lastReg) {
-            regionVar = rand() % costDenominator;
-            i = 0;
-            while(regionVar >= regBinEnd[i])
-               i++;
-            regionNum = ((i + myRank) % numReg()) + 1;
-         }
-         //Pick the bin size of the region and determine the number of elements.
-         binSize = rand() % 1000;
-         if(binSize < 773) {
-           elements = rand() % 15 + 1;
-         }
-         else if(binSize < 937) {
-           elements = rand() % 16 + 16;
-         }
-         else if(binSize < 970) {
-           elements = rand() % 32 + 32;
-         }
-         else if(binSize < 974) {
-           elements = rand() % 64 + 64;
-         } 
-         else if(binSize < 978) {
-           elements = rand() % 128 + 128;
-         }
-         else if(binSize < 981) {
-           elements = rand() % 256 + 256;
-         }
-         else
-            elements = rand() % 1537 + 512;
-         runto = elements + nextIndex;
-         //Store the elements.  If we hit the end before we run out of elements then just stop.
-         while (nextIndex < runto && nextIndex < numElem()) {
-            this->regNumList(nextIndex) = regionNum;
-            nextIndex++;
-         }
-         lastReg = regionNum;
-      } 
-
-      delete [] regBinEnd;
-
-      // Convert regNumList to region index sets
-      // First, count size of each region 
-      for (Index_t i=0 ; i<numElem() ; ++i) {
-         int r = this->regNumList(i)-1; // region index == regnum-1
-         regElemSize(r)++;
-      }
-      // Second, allocate each region index set
-      for (Index_t i=0 ; i<numReg() ; ++i) {
-         m_regElemlist[i] = new Index_t[regElemSize(i)];
-         regElemSize(i) = 0;
-      }
-      // Third, fill index sets
-      for (Index_t i=0 ; i<numElem() ; ++i) {
-         Index_t r = regNumList(i)-1;       // region index == regnum-1
-         Index_t regndx = regElemSize(r)++; // Note increment
-         regElemlist(r,regndx) = i;
-      }
-
-      // Create HybridISets for regions
-      m_domRegISet.resize(numReg());
-      int elemCount = 0 ;
-      for (int r = 0; r < numReg(); ++r) {
-#if !defined(LULESH_LIST_INDEXSET)
-         memcpy( &m_perm[elemCount], regElemlist(r), sizeof(Index_t)*regElemSize(r) ) ;
-         m_domRegISet[r].push_back( RAJA::RangeSegment(elemCount, elemCount+regElemSize(r)) );
-         elemCount += regElemSize(r) ;
-#else
-         m_domRegISet[r].push_back( RAJA::ListSegment(regElemlist(r), regElemSize(r)) );
-#endif
-      }
-
-#if 0 // Check correctness of index sets
-      for (int r = 0; r < numReg(); ++r) {
-         bool good = true;
-         if ( regElemSize(r) != m_domRegISet[r].getLength() ) good = false;
-         if (good) {
-            Index_t* regList = regElemlist(r);
-            int i = 0; 
-            RAJA::forall< LULESH_ISET::ExecPolicy<RAJA::seq_segit, RAJA::seq_exec> >(m_domRegISet[r], [&] (int idx) { 
-               good &= (idx == regList[i]);
-               i++;
-            } );
-         }
-         printf("\nRegion %d index set is %s\n", r, (good ? "GOOD" : "BAD")); 
-      }
-#endif
-   }
-   
-}
-
-/////////////////////////////////////////////////////////////
-void 
-Domain::CreateSymmetryIndexSets(Index_t edgeNodes)
-{
-  if (m_planeLoc == 0) {
-    m_domZSymNodeISet.push_back( RAJA::RangeSegment(0, edgeNodes*edgeNodes) );
-  }
-  if (m_rowLoc == 0) {
-    Index_t *nset = new Index_t[edgeNodes*edgeNodes] ;
-    Index_t nidx = 0 ;
-    for (Index_t i=0; i<edgeNodes; ++i) {
-      Index_t planeInc = i*edgeNodes*edgeNodes ;
-      for (Index_t j=0; j<edgeNodes; ++j) {
-        nset[nidx++] = planeInc + j ;
-      }
-    }
-    m_domYSymNodeISet.push_back( RAJA::ListSegment(nset, edgeNodes*edgeNodes) );
-    delete [] nset ;
-  }
-  if (m_colLoc == 0) {
-    Index_t *nset = new Index_t[edgeNodes*edgeNodes] ;
-    Index_t nidx = 0 ;
-    for (Index_t i=0; i<edgeNodes; ++i) {
-      Index_t planeInc = i*edgeNodes*edgeNodes ;
-      for (Index_t j=0; j<edgeNodes; ++j) {
-        nset[nidx++] = planeInc + j*edgeNodes ;
-      }
-    }
-    m_domXSymNodeISet.push_back( RAJA::ListSegment(nset, edgeNodes*edgeNodes) );
-    delete [] nset ;
-  }
-}
-
-
-
-/////////////////////////////////////////////////////////////
-void
-Domain::SetupElementConnectivities(Index_t edgeElems)
-{
-   lxim[0] = 0 ;
-   for (Index_t i=1; i<numElem(); ++i) {
-      lxim[i]   = i-1 ;
-      lxip[i-1] = i ;
-   }
-   lxip[numElem()-1] = numElem()-1 ;
-
-   for (Index_t i=0; i<edgeElems; ++i) {
-      letam[i] = i ; 
-      letap[numElem()-edgeElems+i] = numElem()-edgeElems+i ;
-   }
-   for (Index_t i=edgeElems; i<numElem(); ++i) {
-      letam[i] = i-edgeElems ;
-      letap[i-edgeElems] = i ;
-   }
-
-   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
-      lzetam[i] = i ;
-      lzetap[numElem()-edgeElems*edgeElems+i] = numElem()-edgeElems*edgeElems+i ;
-   }
-   for (Index_t i=edgeElems*edgeElems; i<numElem(); ++i) {
-      lzetam[i] = i - edgeElems*edgeElems ;
-      lzetap[i-edgeElems*edgeElems] = i ;
-   }
-}
-
-/////////////////////////////////////////////////////////////
-void
-Domain::SetupBoundaryConditions(Index_t edgeElems) 
-{
-  Index_t ghostIdx[6] ;  // offsets to ghost locations
-
-  // set up boundary condition information
-  for (Index_t i=0; i<numElem(); ++i) {
-     elemBC[i] = Int_t(0) ;
-  }
-
-  for (Index_t i=0; i<6; ++i) {
-    ghostIdx[i] = INT_MIN ;
-  }
-
-  Int_t pidx = numElem() ;
-  if (m_planeMin != 0) {
-    ghostIdx[0] = pidx ;
-    pidx += sizeX()*sizeY() ;
-  }
-
-  if (m_planeMax != 0) {
-    ghostIdx[1] = pidx ;
-    pidx += sizeX()*sizeY() ;
-  }
-
-  if (m_rowMin != 0) {
-    ghostIdx[2] = pidx ;
-    pidx += sizeX()*sizeZ() ;
-  }
-
-  if (m_rowMax != 0) {
-    ghostIdx[3] = pidx ;
-    pidx += sizeX()*sizeZ() ;
-  }
-
-  if (m_colMin != 0) {
-    ghostIdx[4] = pidx ;
-    pidx += sizeY()*sizeZ() ;
-  }
-
-  if (m_colMax != 0) {
-    ghostIdx[5] = pidx ;
-  }
-
-  // symmetry plane or free surface BCs 
-  for (Index_t i=0; i<edgeElems; ++i) {
-    Index_t planeInc = i*edgeElems*edgeElems ;
-    Index_t rowInc   = i*edgeElems ;
-    for (Index_t j=0; j<edgeElems; ++j) {
-      if (m_planeLoc == 0) {
-        elemBC[rowInc+j] |= ZETA_M_SYMM ;
-      }
-      else {
-        elemBC[rowInc+j] |= ZETA_M_COMM ;
-        lzetam[rowInc+j] = ghostIdx[0] + rowInc + j ;
-      }
-
-      if (m_planeLoc == m_tp-1) {
-        elemBC[rowInc+j+numElem()-edgeElems*edgeElems] |=
-          ZETA_P_FREE;
-      }
-      else {
-        elemBC[rowInc+j+numElem()-edgeElems*edgeElems] |=
-          ZETA_P_COMM ;
-        lzetap[rowInc+j+numElem()-edgeElems*edgeElems] =
-          ghostIdx[1] + rowInc + j ;
-      }
-
-      if (m_rowLoc == 0) {
-        elemBC[planeInc+j] |= ETA_M_SYMM ;
-      }
-      else {
-        elemBC[planeInc+j] |= ETA_M_COMM ;
-        letam[planeInc+j] = ghostIdx[2] + rowInc + j ;
-      }
-
-      if (m_rowLoc == m_tp-1) {
-        elemBC[planeInc+j+edgeElems*edgeElems-edgeElems] |= 
-          ETA_P_FREE ;
-      }
-      else {
-        elemBC[planeInc+j+edgeElems*edgeElems-edgeElems] |= 
-          ETA_P_COMM ;
-        letap[planeInc+j+edgeElems*edgeElems-edgeElems] =
-          ghostIdx[3] +  rowInc + j ;
-      }
-
-      if (m_colLoc == 0) {
-        elemBC[planeInc+j*edgeElems] |= XI_M_SYMM ;
-      }
-      else {
-        elemBC[planeInc+j*edgeElems] |= XI_M_COMM ;
-        lxim[planeInc+j*edgeElems] = ghostIdx[4] + rowInc + j ;
-      }
-
-      if (m_colLoc == m_tp-1) {
-        elemBC[planeInc+j*edgeElems+edgeElems-1] |= XI_P_FREE ;
-      }
-      else {
-        elemBC[planeInc+j*edgeElems+edgeElems-1] |= XI_P_COMM ;
-        lxip[planeInc+j*edgeElems+edgeElems-1] =
-          ghostIdx[5] + rowInc + j ;
-      }
-    }
-  }
-}
-
-///////////////////////////////////////////////////////////////////////////
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side)
-{
-   Int_t testProcs;
-   Int_t dx, dy, dz;
-   Int_t myDom;
-   
-   // Assume cube processor layout for now 
-   testProcs = Int_t(cbrt(Real_t(numRanks))+0.5) ;
-   if (testProcs*testProcs*testProcs != numRanks) {
-      printf("Num processors must be a cube of an integer (1, 8, 27, ...)\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   if (sizeof(Real_t) != 4 && sizeof(Real_t) != 8) {
-      printf("MPI operations only support float and double right now...\n");
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   if (MAX_FIELDS_PER_MPI_COMM > CACHE_COHERENCE_PAD_REAL) {
-      printf("corner element comm buffers too small.  Fix code.\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-
-   dx = testProcs ;
-   dy = testProcs ;
-   dz = testProcs ;
-
-   // temporary test
-   if (dx*dy*dz != numRanks) {
-      printf("error -- must have as many domains as procs\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   Int_t remainder = dx*dy*dz % numRanks ;
-   if (myRank < remainder) {
-      myDom = myRank*( 1+ (dx*dy*dz / numRanks)) ;
-   }
-   else {
-      myDom = remainder*( 1+ (dx*dy*dz / numRanks)) +
-         (myRank - remainder)*(dx*dy*dz/numRanks) ;
-   }
-
-   *col = myDom % dx ;
-   *row = (myDom / dx) % dy ;
-   *plane = myDom / (dx*dy) ;
-   *side = testProcs;
-
-   return;
-}
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-util.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-util.cc
deleted file mode 100644
index 85c02fcfd..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-util.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <stdio.h>
-#if USE_MPI
-#include <mpi.h>
-#endif
-#include "lulesh.h"
-
-/* Helper function for converting strings to ints, with error checking */
-int StrToInt(const char *token, int *retVal)
-{
-   const char *c ;
-   char *endptr ;
-   const int decimal_base = 10 ;
-
-   if (token == NULL)
-      return 0 ;
-   
-   c = token ;
-   *retVal = (int)strtol(c, &endptr, decimal_base) ;
-   if((endptr != c) && ((*endptr == ' ') || (*endptr == '\0')))
-      return 1 ;
-   else
-      return 0 ;
-}
-
-static void PrintCommandLineOptions(char *execname, int myRank)
-{
-   if (myRank == 0) {
-
-      printf("Usage: %s [opts]\n", execname);
-      printf(" where [opts] is one or more of:\n");
-      printf(" -q              : quiet mode - suppress all stdout\n");
-      printf(" -i <iterations> : number of cycles to run\n");
-      printf(" -s <size>       : length of cube mesh along side\n");
-      printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
-      printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
-      printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
-      printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
-      printf(" -p              : Print out progress\n");
-      printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
-      printf(" -h              : This message\n");
-      printf("\n\n");
-   }
-}
-
-static void ParseError(const char *message, int myRank)
-{
-   if (myRank == 0) {
-      printf("%s\n", message);
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1);
-#else
-      exit(-1);
-#endif
-   }
-}
-
-void ParseCommandLineOptions(int argc, char *argv[],
-                             int myRank, struct cmdLineOpts *opts)
-{
-   if(argc > 1) {
-      int i = 1;
-
-      while(i < argc) {
-         int ok;
-         /* -i <iterations> */
-         if(strcmp(argv[i], "-i") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -i", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->its));
-            if(!ok) {
-               ParseError("Parse Error on option -i integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -s <size, sidelength> */
-         else if(strcmp(argv[i], "-s") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -s\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->nx));
-            if(!ok) {
-               ParseError("Parse Error on option -s integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-	 /* -r <numregions> */
-         else if (strcmp(argv[i], "-r") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -r\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->numReg));
-            if (!ok) {
-               ParseError("Parse Error on option -r integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-	 /* -f <numfilepieces> */
-         else if (strcmp(argv[i], "-f") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -f\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->numFiles));
-            if (!ok) {
-               ParseError("Parse Error on option -f integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -p */
-         else if (strcmp(argv[i], "-p") == 0) {
-            opts->showProg = 1;
-            i++;
-         }
-         /* -q */
-         else if (strcmp(argv[i], "-q") == 0) {
-            opts->quiet = 1;
-            i++;
-         }
-         else if (strcmp(argv[i], "-b") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -b\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->balance));
-            if (!ok) {
-               ParseError("Parse Error on option -b integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         else if (strcmp(argv[i], "-c") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -c\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->cost));
-            if (!ok) {
-               ParseError("Parse Error on option -c integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -v */
-         else if (strcmp(argv[i], "-v") == 0) {
-#if VIZ_MESH            
-            opts->viz = 1;
-#else
-            ParseError("Use of -v requires compiling with -DVIZ_MESH\n", myRank);
-#endif
-            i++;
-         }
-         /* -h */
-         else if (strcmp(argv[i], "-h") == 0) {
-            PrintCommandLineOptions(argv[0], myRank);
-#if USE_MPI            
-            MPI_Abort(MPI_COMM_WORLD, 0);
-#else
-            exit(0);
-#endif
-         }
-         else {
-            char msg[80];
-            PrintCommandLineOptions(argv[0], myRank);
-            sprintf(msg, "ERROR: Unknown command line argument: %s\n", argv[i]);
-            ParseError(msg, myRank);
-         }
-      }
-   }
-}
-
-/////////////////////////////////////////////////////////////////////
-
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks)
-{
-   // GrindTime1 only takes a single domain into account, and is thus a good way to measure
-   // processor speed indepdendent of MPI parallelism.
-   // GrindTime2 takes into account speedups from MPI parallelism 
-   Real_t grindTime1 = ((elapsed_time*1e6)/locDom.cycle())/(nx*nx*nx);
-   Real_t grindTime2 = ((elapsed_time*1e6)/locDom.cycle())/(nx*nx*nx*numRanks);
-
-   Index_t ElemId = 0;
-   printf("Run completed:  \n");
-   printf("   Problem size        =  %i \n",    nx);
-   printf("   MPI tasks           =  %i \n",    numRanks);
-   printf("   Iteration count     =  %i \n",    locDom.cycle());
-   printf("   Final Origin Energy = %12.6e \n", locDom.e[ElemId]);
-
-   Real_t   MaxAbsDiff = Real_t(0.0);
-   Real_t TotalAbsDiff = Real_t(0.0);
-   Real_t   MaxRelDiff = Real_t(0.0);
-
-   Index_t *iperm = new Index_t[locDom.numElem()] ;
-
-   for (Index_t i=0; i<locDom.numElem(); ++i) {
-      iperm[locDom.m_perm[i]] = i ;
-   }
-
-   for (Index_t j=0; j<nx; ++j) {
-      for (Index_t k=j+1; k<nx; ++k) {
-         Real_t AbsDiff = FABS(locDom.e[iperm[j*nx+k]]-locDom.e[iperm[k*nx+j]]);
-         TotalAbsDiff  += AbsDiff;
-
-         if (MaxAbsDiff <AbsDiff) {
-            MaxAbsDiff = AbsDiff;
-         }
-
-         if (locDom.e[iperm[k*nx+j]] != 0.0) {
-            Real_t RelDiff = AbsDiff / locDom.e[iperm[k*nx+j]];
-            if (MaxRelDiff <RelDiff) {
-               MaxRelDiff = RelDiff;
-            }
-         }
-      }
-   }
-
-   delete [] iperm ;
-
-   // Quick symmetry check
-   printf("   Testing Plane 0 of Energy Array on rank 0:\n");
-   printf("        MaxAbsDiff   = %12.6e\n",   MaxAbsDiff   );
-   printf("        TotalAbsDiff = %12.6e\n",   TotalAbsDiff );
-   printf("        MaxRelDiff   = %12.6e\n\n", MaxRelDiff   );
-
-   // Timing information
-   printf("\nElapsed time         = %10.2f (s)\n", elapsed_time);
-   printf("Grind time (us/z/c)  = %10.8g (per dom)  (%10.8g overall)\n", grindTime1, grindTime2);
-   printf("FOM                  = %10.8g (z/s)\n\n", 1000.0/grindTime2); // zones per second
-
-   return ;
-}
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-viz.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-viz.cc
deleted file mode 100644
index 6c41259d5..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh-viz.cc
+++ /dev/null
@@ -1,432 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include "lulesh.h"
-
-#if defined(VIZ_MESH)
-
-#ifdef __cplusplus
-  extern "C" {
-#endif
-#include "silo.h"
-#if USE_MPI
-# include "pmpio.h"
-#endif
-#ifdef __cplusplus
-  }
-#endif
-
-// Function prototypes
-static void 
-DumpDomainToVisit(DBfile *db, Domain& domain, int myRank);
-static
-
-
-#if USE_MPI
-// For some reason, earlier versions of g++ (e.g. 4.2) won't let me
-// put the 'static' qualifier on this prototype, even if it's done
-// consistently in the prototype and definition
-void
-DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 
-                      char basename[], int numRanks);
-
-// Callback prototypes for PMPIO interface (only useful if we're
-// running parallel)
-static void *
-LULESH_PMPIO_Create(const char *fname,
-		     const char *dname,
-		     void *udata);
-static void *
-LULESH_PMPIO_Open(const char *fname,
-		   const char *dname,
-		   PMPIO_iomode_t ioMode,
-		   void *udata);
-static void
-LULESH_PMPIO_Close(void *file, void *udata);
-
-#else
-void
-DumpMultiblockObjects(DBfile *db, char basename[], int numRanks);
-#endif
-
-
-/**********************************************************************/
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks) 
-{
-  char subdirName[32];
-  char basename[32];
-  DBfile *db;
-
-
-  sprintf(basename, "lulesh_plot_c%d", domain.cycle());
-  sprintf(subdirName, "data_%d", myRank);
-
-#if USE_MPI
-
-  PMPIO_baton_t *bat = PMPIO_Init(numFiles,
-				  PMPIO_WRITE,
-				  MPI_COMM_WORLD,
-				  10101,
-				  LULESH_PMPIO_Create,
-				  LULESH_PMPIO_Open,
-				  LULESH_PMPIO_Close,
-				  NULL);
-
-  int myiorank = PMPIO_GroupRank(bat, myRank);
-
-  char fileName[64];
-  
-  if (myiorank == 0) 
-    strcpy(fileName, basename);
-  else
-    sprintf(fileName, "%s.%03d", basename, myiorank);
-
-  db = (DBfile*)PMPIO_WaitForBaton(bat, fileName, subdirName);
-
-  DumpDomainToVisit(db, domain, myRank);
-
-  // Processor 0 writes out bit of extra data to its file that
-  // describes how to stitch all the pieces together
-  if (myRank == 0) {
-    DumpMultiblockObjects(db, bat, basename, numRanks);
-  }
-
-  PMPIO_HandOffBaton(bat, db);
-
-  PMPIO_Finish(bat);
-#else
-
-  db = (DBfile*)DBCreate(basename, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X);
-
-  if (db) {
-     DBMkDir(db, subdirName);
-     DBSetDir(db, subdirName);
-     DumpDomainToVisit(db, domain, myRank);
-     DumpMultiblockObjects(db, basename, numRanks);
-  }
-  else {
-     printf("Error writing out viz file - rank %d\n", myRank);
-  }
-
-#endif
-}
-
-
-
-/**********************************************************************/
-
-static void 
-DumpDomainToVisit(DBfile *db, Domain& domain, int myRank)
-{
-   int ok = 0;
-   
-   /* Create an option list that will give some hints to VisIt for
-    * printing out the cycle and time in the annotations */
-   DBoptlist *optlist;
-
-
-   /* Write out the mesh connectivity in fully unstructured format */
-   int shapetype[1] = {DB_ZONETYPE_HEX};
-   int shapesize[1] = {8};
-   int shapecnt[1] = {domain.numElem()};
-   int *conn = new int[domain.numElem()*8] ;
-   int ci = 0 ;
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      Index_t *elemToNode = &domain.nodelist[8*ei] ;
-      for (int ni=0; ni < 8; ++ni) {
-         conn[ci++] = elemToNode[ni] ;
-      }
-   }
-   ok += DBPutZonelist2(db, "connectivity", domain.numElem(), 3,
-                        conn, domain.numElem()*8,
-                        0,0,0, /* Not carrying ghost zones */
-                        shapetype, shapesize, shapecnt,
-                        1, NULL);
-   delete [] conn ;
-
-   /* Write out the mesh coordinates associated with the mesh */
-   const char* coordnames[3] = {"X", "Y", "Z"};
-   float *coords[3] ;
-   coords[0] = new float[domain.numNode()] ;
-   coords[1] = new float[domain.numNode()] ;
-   coords[2] = new float[domain.numNode()] ;
-   for (int ni=0; ni < domain.numNode() ; ++ni) {
-      coords[0][ni] = float(domain.x[ni]) ;
-      coords[1][ni] = float(domain.y[ni]) ;
-      coords[2][ni] = float(domain.z[ni]) ;
-   }
-   optlist = DBMakeOptlist(2);
-   ok += DBAddOption(optlist, DBOPT_DTIME, &domain.time());
-   ok += DBAddOption(optlist, DBOPT_CYCLE, &domain.cycle());
-   ok += DBPutUcdmesh(db, "mesh", 3, (char**)&coordnames[0], (float**)coords,
-                      domain.numNode(), domain.numElem(), "connectivity",
-                      0, DB_FLOAT, optlist);
-   ok += DBFreeOptlist(optlist);
-   delete [] coords[2] ;
-   delete [] coords[1] ;
-   delete [] coords[0] ;
-
-   /* Write out the materials */
-   int *matnums = new int[domain.numReg()];
-   int dims[1] = {domain.numElem()}; // No mixed elements
-   for(int i=0 ; i<domain.numReg() ; ++i)
-      matnums[i] = i+1;
-   
-   ok += DBPutMaterial(db, "regions", "mesh", domain.numReg(),
-                       matnums, domain.regNumList(), dims, 1,
-                       NULL, NULL, NULL, NULL, 0, DB_FLOAT, NULL);
-   delete [] matnums;
-
-   /* Write out pressure, energy, relvol, q */
-
-   float *e = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      e[ei] = float(domain.e[ei]) ;
-   }
-   ok += DBPutUcdvar1(db, "e", "mesh", e,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] e ;
-
-
-   float *p = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      p[ei] = float(domain.p[ei]) ;
-   }
-   ok += DBPutUcdvar1(db, "p", "mesh", p,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] p ;
-
-   float *v = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      v[ei] = float(domain.v[ei]) ;
-   }
-   ok += DBPutUcdvar1(db, "v", "mesh", v,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] v ;
-
-   float *q = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      q[ei] = float(domain.q[ei]) ;
-   }
-   ok += DBPutUcdvar1(db, "q", "mesh", q,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] q ;
-
-   /* Write out nodal speed, velocities */
-   float *zd    = new float[domain.numNode()];
-   float *yd    = new float[domain.numNode()];
-   float *xd    = new float[domain.numNode()];
-   float *speed = new float[domain.numNode()];
-   for(int ni=0 ; ni < domain.numNode() ; ++ni) {
-      xd[ni]    = float(domain.xd[ni]);
-      yd[ni]    = float(domain.yd[ni]);
-      zd[ni]    = float(domain.zd[ni]);
-      speed[ni] = float(sqrt((xd[ni]*xd[ni])+(yd[ni]*yd[ni])+(zd[ni]*zd[ni])));
-   }
-
-   ok += DBPutUcdvar1(db, "speed", "mesh", speed,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] speed;
-
-
-   ok += DBPutUcdvar1(db, "xd", "mesh", xd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] xd ;
-
-   ok += DBPutUcdvar1(db, "yd", "mesh", yd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] yd ;
-
-   ok += DBPutUcdvar1(db, "zd", "mesh", zd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] zd ;
-
-
-   if (ok != 0) {
-      printf("Error writing out viz file - rank %d\n", myRank);
-   }
-}
-
-/**********************************************************************/
-
-#if USE_MPI     
-void
-   DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 
-                         char basename[], int numRanks)
-#else
-void
-  DumpMultiblockObjects(DBfile *db, char basename[], int numRanks)
-#endif
-{
-   /* MULTIBLOCK objects to tie together multiple files */
-  char **multimeshObjs;
-  char **multimatObjs;
-  char ***multivarObjs;
-  int *blockTypes;
-  int *varTypes;
-  int ok = 0;
-  // Make sure this list matches what's written out above
-  char vars[][10] = {"p","e","v","q", "speed", "xd", "yd", "zd"};
-  int numvars = sizeof(vars)/sizeof(vars[0]);
-
-  // Reset to the root directory of the silo file
-  DBSetDir(db, "/");
-
-  // Allocate a bunch of space for building up the string names
-  multimeshObjs = new char*[numRanks];
-  multimatObjs = new char*[numRanks];
-  multivarObjs = new char**[numvars];
-  blockTypes = new int[numRanks];
-  varTypes = new int[numRanks];
-
-  for(int v=0 ; v<numvars ; ++v) {
-     multivarObjs[v] = new char*[numRanks];
-  }
-  
-  for(int i=0 ; i<numRanks ; ++i) {
-     multimeshObjs[i] = new char[64];
-     multimatObjs[i] = new char[64];
-     for(int v=0 ; v<numvars ; ++v) {
-        multivarObjs[v][i] = new char[64];
-     }
-     blockTypes[i] = DB_UCDMESH;
-     varTypes[i] = DB_UCDVAR;
-  }
-      
-  // Build up the multiobject names
-  for(int i=0 ; i<numRanks ; ++i) {
-#if USE_MPI     
-    int iorank = PMPIO_GroupRank(bat, i);
-#else
-    int iorank = 0;
-#endif
-
-    //delete multivarObjs[i];
-    if (iorank == 0) {
-      snprintf(multimeshObjs[i], 64, "/data_%d/mesh", i);
-      snprintf(multimatObjs[i], 64, "/data_%d/regions",i);
-      for(int v=0 ; v<numvars ; ++v) {
-	snprintf(multivarObjs[v][i], 64, "/data_%d/%s", i, vars[v]);
-      }
-     
-    }
-    else {
-      snprintf(multimeshObjs[i], 64, "%s.%03d:/data_%d/mesh",
-               basename, iorank, i);
-      snprintf(multimatObjs[i], 64, "%s.%03d:/data_%d/regions", 
-	       basename, iorank, i);
-      for(int v=0 ; v<numvars ; ++v) {
-         snprintf(multivarObjs[v][i], 64, "%s.%03d:/data_%d/%s", 
-                  basename, iorank, i, vars[v]);
-      }
-    }
-  }
-
-  // Now write out the objects
-  ok += DBPutMultimesh(db, "mesh", numRanks,
-		       (char**)multimeshObjs, blockTypes, NULL);
-  ok += DBPutMultimat(db, "regions", numRanks,
-		      (char**)multimatObjs, NULL);
-  for(int v=0 ; v<numvars ; ++v) {
-     ok += DBPutMultivar(db, vars[v], numRanks,
-                         (char**)multivarObjs[v], varTypes, NULL);
-  }
-
-  for(int v=0; v < numvars; ++v) {
-    for(int i = 0; i < numRanks; i++) {
-      delete multivarObjs[v][i];
-    }
-    delete multivarObjs[v];
-  }
-
-  // Clean up
-  for(int i=0 ; i<numRanks ; i++) {
-    delete multimeshObjs[i];
-    delete multimatObjs[i];
-  }
-  delete [] multimeshObjs;
-  delete [] multimatObjs;
-  delete [] multivarObjs;
-  delete [] blockTypes;
-  delete [] varTypes;
-
-  if (ok != 0) {
-    printf("Error writing out multiXXX objs to viz file - rank 0\n");
-  }
-}
-
-# if USE_MPI
-
-/**********************************************************************/
-
-static void *
-LULESH_PMPIO_Create(const char *fname,
-		     const char *dname,
-		     void *udata)
-{
-   /* Create the file */
-   DBfile* db = DBCreate(fname, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X);
-
-   /* Put the data in a subdirectory, so VisIt only sees the multimesh
-    * objects we write out in the base file */
-   if (db) {
-     DBMkDir(db, dname);
-     DBSetDir(db, dname);
-   }
-   return (void*)db;
-}
-
-   
-/**********************************************************************/
-
-static void *
-LULESH_PMPIO_Open(const char *fname,
-		   const char *dname,
-		   PMPIO_iomode_t ioMode,
-		   void *udata)
-{
-   /* Open the file */
-  DBfile* db = DBOpen(fname, DB_UNKNOWN, DB_APPEND);
-
-   /* Put the data in a subdirectory, so VisIt only sees the multimesh
-    * objects we write out in the base file */
-   if (db) {
-     DBMkDir(db, dname);
-     DBSetDir(db, dname);
-   }
-   return (void*)db;
-}
-
-   
-/**********************************************************************/
-
-static void
-LULESH_PMPIO_Close(void *file, void *udata)
-{
-  DBfile *db = (DBfile*)file;
-  if (db)
-    DBClose(db);
-}
-# endif
-
-   
-#else
-
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks)
-{
-   if (myRank == 0) {
-      printf("Must enable -DVIZ_MESH at compile time to call DumpDomain\n");
-   }
-}
-
-#endif
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh.cc
deleted file mode 100644
index f24ac380e..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh.cc
+++ /dev/null
@@ -1,2639 +0,0 @@
-/*
-  This is a Version 2.0 MPI + OpenMP implementation of LULESH
-
-                 Copyright (c) 2010-2013.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 2.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-//////////////
-DIFFERENCES BETWEEN THIS VERSION (2.x) AND EARLIER VERSIONS:
-* Addition of regions to make work more representative of multi-material codes
-* Default size of each domain is 30^3 (27000 elem) instead of 45^3. This is
-  more representative of our actual working set sizes
-* Single source distribution supports pure serial, pure OpenMP, MPI-only, 
-  and MPI+OpenMP
-* Addition of ability to visualize the mesh using VisIt 
-  https://wci.llnl.gov/codes/visit/download.html
-* Various command line options (see ./lulesh2.0 -h)
- -q              : quiet mode - suppress stdout
- -i <iterations> : number of cycles to run
- -s <size>       : length of cube mesh along side
- -r <numregions> : Number of distinct regions (def: 11)
- -b <balance>    : Load balance between regions of a domain (def: 1)
- -c <cost>       : Extra cost of more expensive regions (def: 1)
- -f <filepieces> : Number of file parts for viz output (def: np/9)
- -p              : Print out progress
- -v              : Output viz file (requires compiling with -DVIZ_MESH
- -h              : This message
-
- printf("Usage: %s [opts]\n", execname);
-      printf(" where [opts] is one or more of:\n");
-      printf(" -q              : quiet mode - suppress all stdout\n");
-      printf(" -i <iterations> : number of cycles to run\n");
-      printf(" -s <size>       : length of cube mesh along side\n");
-      printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
-      printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
-      printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
-      printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
-      printf(" -p              : Print out progress\n");
-      printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
-      printf(" -h              : This message\n");
-      printf("\n\n");
-
-*Notable changes in LULESH 2.0
-
-* Split functionality into different files
-lulesh.cc - where most (all?) of the timed functionality lies
-lulesh-comm.cc - MPI functionality
-lulesh-init.cc - Setup code
-lulesh-viz.cc  - Support for visualization option
-lulesh-util.cc - Non-timed functions
-*
-* The concept of "regions" was added, although every region is the same ideal
-*    gas material, and the same sedov blast wave problem is still the only
-*    problem its hardcoded to solve.
-* Regions allow two things important to making this proxy app more representative:
-*   Four of the LULESH routines are now performed on a region-by-region basis,
-*     making the memory access patterns non-unit stride
-*   Artificial load imbalances can be easily introduced that could impact
-*     parallelization strategies.  
-* The load balance flag changes region assignment.  Region number is raised to
-*   the power entered for assignment probability.  Most likely regions changes
-*   with MPI process id.
-* The cost flag raises the cost of ~45% of the regions to evaluate EOS by the
-*   entered multiple. The cost of 5% is 10x the entered multiple.
-* MPI and OpenMP were added, and coalesced into a single version of the source
-*   that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
-* Added support to write plot files using "poor mans parallel I/O" when linked
-*   with the silo library, which in turn can be read by VisIt.
-* Enabled variable timestep calculation by default (courant condition), which
-*   results in an additional reduction.
-* Default domain (mesh) size reduced from 45^3 to 30^3
-* Command line options to allow numerous test cases without needing to recompile
-* Performance optimizations and code cleanup beyond LULESH 1.0
-* Added a "Figure of Merit" calculation (elements solved per microsecond) and
-*   output in support of using LULESH 2.0 for the 2017 CORAL procurement
-*
-* Possible Differences in Final Release (other changes possible)
-*
-* High Level mesh structure to allow data structure transformations
-* Different default parameters
-* Minor code performance changes and cleanup
-
-TODO in future versions
-* Add reader for (truly) unstructured meshes, probably serial only
-* CMake based build system
-
-//////////////
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <climits>
-#include <vector>
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <time.h>
-#include <sys/time.h>
-#include <iostream>
-#include <unistd.h>
-
-#if USE_OMP
-# include <omp.h>
-#endif
-
-#include "lulesh.h"
-
-#include "Timer.hxx"
-
-#define RAJA_STORAGE static inline
-//#define RAJA_STORAGE 
-
-/******************************************/
-
-/* Work Routines */
-
-RAJA_STORAGE
-void TimeIncrement(Domain& domain)
-{
-   Real_t targetdt = domain.stoptime() - domain.time() ;
-
-   if ((domain.dtfixed() <= Real_t(0.0)) && (domain.cycle() != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = domain.deltatime() ;
-
-      /* This will require a reduction in parallel */
-      Real_t gnewdt = Real_t(1.0e+20) ;
-      Real_t newdt ;
-      if (domain.dtcourant() < gnewdt) {
-         gnewdt = domain.dtcourant() / Real_t(2.0) ;
-      }
-      if (domain.dthydro() < gnewdt) {
-         gnewdt = domain.dthydro() * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-#if USE_MPI      
-      MPI_Allreduce(&gnewdt, &newdt, 1,
-                    ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE),
-                    MPI_MIN, MPI_COMM_WORLD) ;
-#else
-      newdt = gnewdt;
-#endif
-      
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < domain.deltatimemultlb()) {
-            newdt = olddt ;
-         }
-         else if (ratio > domain.deltatimemultub()) {
-            newdt = olddt*domain.deltatimemultub() ;
-         }
-      }
-
-      if (newdt > domain.dtmax()) {
-         newdt = domain.dtmax() ;
-      }
-      domain.deltatime() = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > domain.deltatime()) &&
-       (targetdt < (Real_t(4.0) * domain.deltatime() / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * domain.deltatime() / Real_t(3.0) ;
-   }
-
-   if (targetdt < domain.deltatime()) {
-      domain.deltatime() = targetdt ;
-   }
-
-   domain.time() += domain.deltatime() ;
-
-   ++domain.cycle() ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CollectDomainNodesToElemNodes(Domain* domain,
-                                   const Index_t* elemToNode,
-                                   Real_t elemX[8],
-                                   Real_t elemY[8],
-                                   Real_t elemZ[8])
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = domain->x[nd0i];
-   elemX[1] = domain->x[nd1i];
-   elemX[2] = domain->x[nd2i];
-   elemX[3] = domain->x[nd3i];
-   elemX[4] = domain->x[nd4i];
-   elemX[5] = domain->x[nd5i];
-   elemX[6] = domain->x[nd6i];
-   elemX[7] = domain->x[nd7i];
-
-   elemY[0] = domain->y[nd0i];
-   elemY[1] = domain->y[nd1i];
-   elemY[2] = domain->y[nd2i];
-   elemY[3] = domain->y[nd3i];
-   elemY[4] = domain->y[nd4i];
-   elemY[5] = domain->y[nd5i];
-   elemY[6] = domain->y[nd6i];
-   elemY[7] = domain->y[nd7i];
-
-   elemZ[0] = domain->z[nd0i];
-   elemZ[1] = domain->z[nd1i];
-   elemZ[2] = domain->z[nd2i];
-   elemZ[3] = domain->z[nd3i];
-   elemZ[4] = domain->z[nd4i];
-   elemZ[5] = domain->z[nd5i];
-   elemZ[6] = domain->z[nd6i];
-   elemZ[7] = domain->z[nd7i];
-
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void InitStressTermsForElems(Domain* domain,
-                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-
-   RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int i) {
-      sigxx[i] = sigyy[i] = sigzz[i] =  - domain->p[i] - domain->q[i] ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemShapeFunctionDerivatives( Real_t const x[],
-                                       Real_t const y[],
-                                       Real_t const z[],
-                                       Real_t b[][8],
-                                       Real_t* const volume )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemNodeNormals(Real_t pfx[8],
-                         Real_t pfy[8],
-                         Real_t pfz[8],
-                         const Real_t x[8],
-                         const Real_t y[8],
-                         const Real_t z[8])
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_t* fx, Real_t* fy, Real_t* fz )
-{
-   for(Index_t i = 0; i < 8; i++) {
-      fx[i] = -( stress_xx * B[0][i] );
-      fy[i] = -( stress_yy * B[1][i]  );
-      fz[i] = -( stress_zz * B[2][i] );
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void IntegrateStressForElems( Domain* domain,
-                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                              Real_t *determ, Index_t numElem)
-{
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *fx_elem;
-   Real_t *fy_elem;
-   Real_t *fz_elem;
-   Real_t fx_local[8] ;
-   Real_t fy_local[8] ;
-   Real_t fz_local[8] ;
-
-   Real_t* tfx_local = fx_local;
-   Real_t* tfy_local = fy_local;
-   Real_t* tfz_local = fz_local;
-
-
-  if (numthreads > 1) {
-     fx_elem = Allocate<Real_t>(numElem8) ;
-     fy_elem = Allocate<Real_t>(numElem8) ;
-     fz_elem = Allocate<Real_t>(numElem8) ;
-  }
-  // loop over all elements
-
-  RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int k) {
-    const Index_t* const elemToNode = &domain->nodelist[8*k];
-    Real_t B[3][8] ;// shape function derivatives
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
-
-    // Volume calculation involves extra work for numerical consistency
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                         B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                          x_local, y_local, z_local );
-
-    if (numthreads > 1) {
-       // Eliminate thread writing conflicts at the nodes by giving
-       // each element its own copy to write to
-       SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                    &fx_elem[k*8],
-                                    &fy_elem[k*8],
-                                    &fz_elem[k*8] ) ;
-    }
-    else {
-       SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                    tfx_local, tfy_local, tfz_local ) ;
-
-       // copy nodal force contributions to global force arrray.
-       for( Index_t lnode=0 ; lnode<8 ; ++lnode ) {
-          Index_t gnode = elemToNode[lnode];
-          domain->fx[gnode] += tfx_local[lnode];
-          domain->fy[gnode] += tfy_local[lnode];
-          domain->fz[gnode] += tfz_local[lnode];
-       }
-    }
-  } );
-
-  if (numthreads > 1) {
-     // If threaded, then we need to copy the data out of the temporary
-     // arrays used above into the final forces field
-     RAJA::forall<node_exec_policy>(domain->getNodeISet(), [=] (int gnode) {
-        Index_t count = domain->nodeElemCount(gnode) ;
-        Index_t *cornerList = domain->nodeElemCornerList(gnode) ;
-        Real_t fx_tmp = Real_t(0.0) ;
-        Real_t fy_tmp = Real_t(0.0) ;
-        Real_t fz_tmp = Real_t(0.0) ;
-        for (Index_t i=0 ; i < count ; ++i) {
-           Index_t ielem = cornerList[i] ;
-           fx_tmp += fx_elem[ielem] ;
-           fy_tmp += fy_elem[ielem] ;
-           fz_tmp += fz_elem[ielem] ;
-        }
-        domain->fx[gnode] = fx_tmp ;
-        domain->fy[gnode] = fy_tmp ;
-        domain->fz[gnode] = fz_tmp ;
-     } );
-     Release(&fz_elem) ;
-     Release(&fy_elem) ;
-     Release(&fx_elem) ;
-  }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemVolumeDerivative(Real_t dvdx[8],
-                              Real_t dvdy[8],
-                              Real_t dvdz[8],
-                              const Real_t x[8],
-                              const Real_t y[8],
-                              const Real_t z[8])
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t hourgam[][4],
-                              Real_t coefficient,
-                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
-{
-   Real_t hxx[4];
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * xd[0] + hourgam[1][i] * xd[1] +
-               hourgam[2][i] * xd[2] + hourgam[3][i] * xd[3] +
-               hourgam[4][i] * xd[4] + hourgam[5][i] * xd[5] +
-               hourgam[6][i] * xd[6] + hourgam[7][i] * xd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfx[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * yd[0] + hourgam[1][i] * yd[1] +
-               hourgam[2][i] * yd[2] + hourgam[3][i] * yd[3] +
-               hourgam[4][i] * yd[4] + hourgam[5][i] * yd[5] +
-               hourgam[6][i] * yd[6] + hourgam[7][i] * yd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfy[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * zd[0] + hourgam[1][i] * zd[1] +
-               hourgam[2][i] * zd[2] + hourgam[3][i] * zd[3] +
-               hourgam[4][i] * zd[4] + hourgam[5][i] * zd[5] +
-               hourgam[6][i] * zd[6] + hourgam[7][i] * zd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfz[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcFBHourglassForceForElems( Domain* domain,
-                                   Real_t *determ,
-                                   Real_t *x8n, Real_t *y8n, Real_t *z8n,
-                                   Real_t *dvdx, Real_t *dvdy, Real_t *dvdz,
-                                   Real_t hourg, Index_t numElem)
-{
-
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-   /*************************************************
-    *
-    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-    *               force.
-    *
-    *************************************************/
-  
-   Index_t numElem8 = numElem * 8 ;
-
-   Real_t *fx_elem; 
-   Real_t *fy_elem; 
-   Real_t *fz_elem; 
-
-   if(numthreads > 1) {
-      fx_elem = Allocate<Real_t>(numElem8) ;
-      fy_elem = Allocate<Real_t>(numElem8) ;
-      fz_elem = Allocate<Real_t>(numElem8) ;
-   }
-
-   Real_t  gamma[4][8];
-
-   gamma[0][0] = Real_t( 1.);
-   gamma[0][1] = Real_t( 1.);
-   gamma[0][2] = Real_t(-1.);
-   gamma[0][3] = Real_t(-1.);
-   gamma[0][4] = Real_t(-1.);
-   gamma[0][5] = Real_t(-1.);
-   gamma[0][6] = Real_t( 1.);
-   gamma[0][7] = Real_t( 1.);
-   gamma[1][0] = Real_t( 1.);
-   gamma[1][1] = Real_t(-1.);
-   gamma[1][2] = Real_t(-1.);
-   gamma[1][3] = Real_t( 1.);
-   gamma[1][4] = Real_t(-1.);
-   gamma[1][5] = Real_t( 1.);
-   gamma[1][6] = Real_t( 1.);
-   gamma[1][7] = Real_t(-1.);
-   gamma[2][0] = Real_t( 1.);
-   gamma[2][1] = Real_t(-1.);
-   gamma[2][2] = Real_t( 1.);
-   gamma[2][3] = Real_t(-1.);
-   gamma[2][4] = Real_t( 1.);
-   gamma[2][5] = Real_t(-1.);
-   gamma[2][6] = Real_t( 1.);
-   gamma[2][7] = Real_t(-1.);
-   gamma[3][0] = Real_t(-1.);
-   gamma[3][1] = Real_t( 1.);
-   gamma[3][2] = Real_t(-1.);
-   gamma[3][3] = Real_t( 1.);
-   gamma[3][4] = Real_t( 1.);
-   gamma[3][5] = Real_t(-1.);
-   gamma[3][6] = Real_t( 1.);
-   gamma[3][7] = Real_t(-1.);
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-
-   RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int i2) {
-      Real_t *fx_local, *fy_local, *fz_local ;
-      Real_t hgfx[8], hgfy[8], hgfz[8] ;
-
-      Real_t coefficient;
-
-      Real_t hourgam[8][4];
-      Real_t xd1[8], yd1[8], zd1[8] ;
-
-      const Index_t *elemToNode = &domain->nodelist[8*i2];
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
-            x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
-            x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
-            x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
-            y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
-            y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
-            y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
-            z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
-            z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
-            z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];
-
-         hourgam[0][i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam[1][i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam[2][i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam[3][i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam[4][i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam[5][i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam[6][i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam[7][i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=domain->ss[i2];
-      mass1=domain->elemMass[i2];
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = domain->xd[n0si2];
-      xd1[1] = domain->xd[n1si2];
-      xd1[2] = domain->xd[n2si2];
-      xd1[3] = domain->xd[n3si2];
-      xd1[4] = domain->xd[n4si2];
-      xd1[5] = domain->xd[n5si2];
-      xd1[6] = domain->xd[n6si2];
-      xd1[7] = domain->xd[n7si2];
-
-      yd1[0] = domain->yd[n0si2];
-      yd1[1] = domain->yd[n1si2];
-      yd1[2] = domain->yd[n2si2];
-      yd1[3] = domain->yd[n3si2];
-      yd1[4] = domain->yd[n4si2];
-      yd1[5] = domain->yd[n5si2];
-      yd1[6] = domain->yd[n6si2];
-      yd1[7] = domain->yd[n7si2];
-
-      zd1[0] = domain->zd[n0si2];
-      zd1[1] = domain->zd[n1si2];
-      zd1[2] = domain->zd[n2si2];
-      zd1[3] = domain->zd[n3si2];
-      zd1[4] = domain->zd[n4si2];
-      zd1[5] = domain->zd[n5si2];
-      zd1[6] = domain->zd[n6si2];
-      zd1[7] = domain->zd[n7si2];
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1,
-                      hourgam,
-                      coefficient, hgfx, hgfy, hgfz);
-
-      // With the threaded version, we write into local arrays per elem
-      // so we don't have to worry about race conditions
-      if (numthreads > 1) {
-         fx_local = &fx_elem[i3] ;
-         fx_local[0] = hgfx[0];
-         fx_local[1] = hgfx[1];
-         fx_local[2] = hgfx[2];
-         fx_local[3] = hgfx[3];
-         fx_local[4] = hgfx[4];
-         fx_local[5] = hgfx[5];
-         fx_local[6] = hgfx[6];
-         fx_local[7] = hgfx[7];
-
-         fy_local = &fy_elem[i3] ;
-         fy_local[0] = hgfy[0];
-         fy_local[1] = hgfy[1];
-         fy_local[2] = hgfy[2];
-         fy_local[3] = hgfy[3];
-         fy_local[4] = hgfy[4];
-         fy_local[5] = hgfy[5];
-         fy_local[6] = hgfy[6];
-         fy_local[7] = hgfy[7];
-
-         fz_local = &fz_elem[i3] ;
-         fz_local[0] = hgfz[0];
-         fz_local[1] = hgfz[1];
-         fz_local[2] = hgfz[2];
-         fz_local[3] = hgfz[3];
-         fz_local[4] = hgfz[4];
-         fz_local[5] = hgfz[5];
-         fz_local[6] = hgfz[6];
-         fz_local[7] = hgfz[7];
-      }
-      else {
-         domain->fx[n0si2] += hgfx[0];
-         domain->fy[n0si2] += hgfy[0];
-         domain->fz[n0si2] += hgfz[0];
-
-         domain->fx[n1si2] += hgfx[1];
-         domain->fy[n1si2] += hgfy[1];
-         domain->fz[n1si2] += hgfz[1];
-
-         domain->fx[n2si2] += hgfx[2];
-         domain->fy[n2si2] += hgfy[2];
-         domain->fz[n2si2] += hgfz[2];
-
-         domain->fx[n3si2] += hgfx[3];
-         domain->fy[n3si2] += hgfy[3];
-         domain->fz[n3si2] += hgfz[3];
-
-         domain->fx[n4si2] += hgfx[4];
-         domain->fy[n4si2] += hgfy[4];
-         domain->fz[n4si2] += hgfz[4];
-
-         domain->fx[n5si2] += hgfx[5];
-         domain->fy[n5si2] += hgfy[5];
-         domain->fz[n5si2] += hgfz[5];
-
-         domain->fx[n6si2] += hgfx[6];
-         domain->fy[n6si2] += hgfy[6];
-         domain->fz[n6si2] += hgfz[6];
-
-         domain->fx[n7si2] += hgfx[7];
-         domain->fy[n7si2] += hgfy[7];
-         domain->fz[n7si2] += hgfz[7];
-      }
-   } );
-
-   if (numthreads > 1) {
-     // Collect the data from the local arrays into the final force arrays
-      RAJA::forall<node_exec_policy>(domain->getNodeISet(), [=] (int gnode) {
-         Index_t count = domain->nodeElemCount(gnode) ;
-         Index_t *cornerList = domain->nodeElemCornerList(gnode) ;
-         Real_t fx_tmp = Real_t(0.0) ;
-         Real_t fy_tmp = Real_t(0.0) ;
-         Real_t fz_tmp = Real_t(0.0) ;
-         for (Index_t i=0 ; i < count ; ++i) {
-            Index_t ielem = cornerList[i] ;
-            fx_tmp += fx_elem[ielem] ;
-            fy_tmp += fy_elem[ielem] ;
-            fz_tmp += fz_elem[ielem] ;
-         }
-         domain->fx[gnode] += fx_tmp ;
-         domain->fy[gnode] += fy_tmp ;
-         domain->fz[gnode] += fz_tmp ;
-      } );
-      Release(&fz_elem) ;
-      Release(&fy_elem) ;
-      Release(&fx_elem) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcHourglassControlForElems(Domain* domain,
-                                  Real_t determ[], Real_t hgcoef)
-{
-   Index_t numElem = domain->numElem() ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *dvdx = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdy = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdz = Allocate<Real_t>(numElem8) ;
-   Real_t *x8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *y8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *z8n  = Allocate<Real_t>(numElem8) ;
-
-   /* start loop over elements */
-   RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int i) {
-      Real_t  x1[8],  y1[8],  z1[8] ;
-      Real_t pfx[8], pfy[8], pfz[8] ;
-
-      Index_t* elemToNode = &domain->nodelist[8*i];
-      CollectDomainNodesToElemNodes(domain, elemToNode, x1, y1, z1);
-
-      CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
-
-      /* load into temporary storage for FB Hour Glass control */
-      for(Index_t ii=0;ii<8;++ii) {
-         Index_t jj=8*i+ii;
-
-         dvdx[jj] = pfx[ii];
-         dvdy[jj] = pfy[ii];
-         dvdz[jj] = pfz[ii];
-         x8n[jj]  = x1[ii];
-         y8n[jj]  = y1[ii];
-         z8n[jj]  = z1[ii];
-      }
-
-      determ[i] = domain->volo[i] * domain->v[i];
-
-      /* Do a check for negative volumes */
-      if ( domain->v[i] <= Real_t(0.0) ) {
-#if USE_MPI         
-         MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-         exit(VolumeError);
-#endif
-      }
-   } );
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems( domain,
-                                    determ, x8n, y8n, z8n, dvdx, dvdy, dvdz,
-                                    hgcoef, numElem ) ;
-   }
-
-   Release(&z8n) ;
-   Release(&y8n) ;
-   Release(&x8n) ;
-   Release(&dvdz) ;
-   Release(&dvdy) ;
-   Release(&dvdx) ;
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcVolumeForceForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-   if (numElem != 0) {
-      Real_t  hgcoef = domain->hgcoef() ;
-      Real_t *sigxx  = Allocate<Real_t>(numElem) ;
-      Real_t *sigyy  = Allocate<Real_t>(numElem) ;
-      Real_t *sigzz  = Allocate<Real_t>(numElem) ;
-      Real_t *determ = Allocate<Real_t>(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(domain, sigxx, sigyy, sigzz);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( domain,
-                               sigxx, sigyy, sigzz, determ, numElem );
-
-      // check for negative element volume
-      RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int k) {
-         if (determ[k] <= Real_t(0.0)) {
-#if USE_MPI            
-            MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-            exit(VolumeError);
-#endif
-         }
-      } );
-
-      CalcHourglassControlForElems(domain, determ, hgcoef) ;
-
-      Release(&determ) ;
-      Release(&sigzz) ;
-      Release(&sigyy) ;
-      Release(&sigxx) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE void CalcForceForNodes(Domain* domain)
-{
-#if USE_MPI  
-  CommRecv(*domain, MSG_COMM_SBN, 3,
-           domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-           true, false) ;
-#endif  
-
-  RAJA::forall<node_exec_policy>(domain->getNodeISet(), [=] (int i) {
-     domain->fx[i] = Real_t(0.0) ;
-     domain->fy[i] = Real_t(0.0) ;
-     domain->fz[i] = Real_t(0.0) ;
-  } );
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems(domain) ;
-
-#if USE_MPI  
-  Domain_member fieldData[3] ;
-  fieldData[0] = &Domain::fx ;
-  fieldData[1] = &Domain::fy ;
-  fieldData[2] = &Domain::fz ;
-  
-  CommSend(*domain, MSG_COMM_SBN, 3, fieldData,
-           domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() +  1,
-           true, false) ;
-  CommSBN(*domain, 3, fieldData) ;
-#endif  
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcAccelerationForNodes(Domain* domain)
-{
-   
-   RAJA::forall<node_exec_policy>(domain->getNodeISet(), [=] (int i) {
-      domain->xdd[i] = domain->fx[i] / domain->nodalMass[i];
-      domain->ydd[i] = domain->fy[i] / domain->nodalMass[i];
-      domain->zdd[i] = domain->fz[i] / domain->nodalMass[i];
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void ApplyAccelerationBoundaryConditionsForNodes(Domain* domain)
-{
-   RAJA::forall<symnode_exec_policy>(domain->getXSymNodeISet(), [=] (int i) {
-      domain->xdd[i] = Real_t(0.0) ;
-   } );
-
-   RAJA::forall<symnode_exec_policy>(domain->getYSymNodeISet(), [=] (int i) {
-      domain->ydd[i] = Real_t(0.0) ;
-   } );
-
-   RAJA::forall<symnode_exec_policy>(domain->getZSymNodeISet(), [=] (int i) {
-      domain->zdd[i] = Real_t(0.0) ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcVelocityForNodes(Domain* domain, const Real_t dt, const Real_t u_cut)
-{
-
-   RAJA::forall<node_exec_policy>(domain->getNodeISet(), [=] (int i) {
-     Real_t xdtmp, ydtmp, zdtmp ;
-
-     xdtmp = domain->xd[i] + domain->xdd[i] * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     domain->xd[i] = xdtmp ;
-
-     ydtmp = domain->yd[i] + domain->ydd[i] * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     domain->yd[i] = ydtmp ;
-
-     zdtmp = domain->zd[i] + domain->zdd[i] * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     domain->zd[i] = zdtmp ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcPositionForNodes(Domain* domain, const Real_t dt)
-{
-   RAJA::forall<node_exec_policy>(domain->getNodeISet(), [=] (int i) {
-     domain->x[i] += domain->xd[i] * dt ;
-     domain->y[i] += domain->yd[i] * dt ;
-     domain->z[i] += domain->zd[i] * dt ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeNodal(Domain* domain)
-{
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-   Domain_member fieldData[6] ;
-#endif
-
-   const Real_t delt = domain->deltatime() ;
-   Real_t u_cut = domain->u_cut() ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes(domain);
-
-#if USE_MPI  
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-   CommRecv(*domain, MSG_SYNC_POS_VEL, 6,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-#endif
-#endif
-   
-   CalcAccelerationForNodes(domain);
-   
-   ApplyAccelerationBoundaryConditionsForNodes(domain);
-
-   CalcVelocityForNodes( domain, delt, u_cut) ;
-
-   CalcPositionForNodes( domain, delt );
-#if USE_MPI
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-  fieldData[0] = &Domain::x ;
-  fieldData[1] = &Domain::y ;
-  fieldData[2] = &Domain::z ;
-  fieldData[3] = &Domain::xd ;
-  fieldData[4] = &Domain::yd ;
-  fieldData[5] = &Domain::zd ;
-
-   CommSend(*domain, MSG_SYNC_POS_VEL, 6, fieldData,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-   CommSyncPosVel(*domain) ;
-#endif
-#endif
-   
-  return;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-/******************************************/
-
-//inline
-Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemVelocityGradient( const Real_t* const xvel,
-                                const Real_t* const yvel,
-                                const Real_t* const zvel,
-                                const Real_t b[][8],
-                                const Real_t detJ,
-                                Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-/******************************************/
-
-//RAJA_STORAGE
-void CalcKinematicsForElems( Domain* domain,
-                             Real_t deltaTime, Index_t numElem )
-{
-
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int k) { 
-    Real_t B[3][8] ; /** shape function derivatives */
-    Real_t D[6] ;
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-    Real_t xd_local[8] ;
-    Real_t yd_local[8] ;
-    Real_t zd_local[8] ;
-    Real_t detJ = Real_t(0.0) ;
-
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_t* const elemToNode = &domain->nodelist[8*k] ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / domain->volo[k] ;
-    domain->vnew[k] = relativeVolume ;
-    domain->delv[k] = relativeVolume - domain->v[k] ;
-
-    // set characteristic length
-    domain->arealg[k] = CalcElemCharacteristicLength(x_local, y_local, z_local,
-                                             volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = domain->xd[gnode];
-      yd_local[lnode] = domain->yd[gnode];
-      zd_local[lnode] = domain->zd[gnode]; 
-    }
-
-    Real_t dt2 = Real_t(0.5) * deltaTime;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local, y_local, z_local,
-                                      B, &detJ );
-
-    CalcElemVelocityGradient( xd_local, yd_local, zd_local,
-                               B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    domain->dxx[k] = D[0];
-    domain->dyy[k] = D[1];
-    domain->dzz[k] = D[2];
-  } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcLagrangeElements(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-   if (numElem > 0) {
-      const Real_t deltatime = domain->deltatime() ;
-
-      domain->AllocateStrains(numElem);
-
-      CalcKinematicsForElems(domain, deltatime, numElem) ;
-
-      // element loop to do some stuff not included in the elemlib function.
-      RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int k) {
-         // calc strain rate and apply as constraint (only done in FB element)
-         Real_t vdov = domain->dxx[k] + domain->dyy[k] + domain->dzz[k] ;
-         Real_t vdovthird = vdov/Real_t(3.0) ;
-
-         // make the rate of deformation tensor deviatoric
-         domain->vdov[k] = vdov ;
-         domain->dxx[k] -= vdovthird ;
-         domain->dyy[k] -= vdovthird ;
-         domain->dzz[k] -= vdovthird ;
-
-        // See if any volumes are negative, and take appropriate action.
-         if (domain->vnew[k] <= Real_t(0.0))
-        {
-#if USE_MPI           
-           MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-           exit(VolumeError);
-#endif
-        }
-      } );
-      domain->DeallocateStrains();
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQGradientsForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem();
-
-   RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int i) {
-      const Real_t ptiny = Real_t(1.e-36) ;
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      const Index_t *elemToNode = &domain->nodelist[8*i];
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = domain->x[n0] ;
-      Real_t x1 = domain->x[n1] ;
-      Real_t x2 = domain->x[n2] ;
-      Real_t x3 = domain->x[n3] ;
-      Real_t x4 = domain->x[n4] ;
-      Real_t x5 = domain->x[n5] ;
-      Real_t x6 = domain->x[n6] ;
-      Real_t x7 = domain->x[n7] ;
-
-      Real_t y0 = domain->y[n0] ;
-      Real_t y1 = domain->y[n1] ;
-      Real_t y2 = domain->y[n2] ;
-      Real_t y3 = domain->y[n3] ;
-      Real_t y4 = domain->y[n4] ;
-      Real_t y5 = domain->y[n5] ;
-      Real_t y6 = domain->y[n6] ;
-      Real_t y7 = domain->y[n7] ;
-
-      Real_t z0 = domain->z[n0] ;
-      Real_t z1 = domain->z[n1] ;
-      Real_t z2 = domain->z[n2] ;
-      Real_t z3 = domain->z[n3] ;
-      Real_t z4 = domain->z[n4] ;
-      Real_t z5 = domain->z[n5] ;
-      Real_t z6 = domain->z[n6] ;
-      Real_t z7 = domain->z[n7] ;
-
-      Real_t xv0 = domain->xd[n0] ;
-      Real_t xv1 = domain->xd[n1] ;
-      Real_t xv2 = domain->xd[n2] ;
-      Real_t xv3 = domain->xd[n3] ;
-      Real_t xv4 = domain->xd[n4] ;
-      Real_t xv5 = domain->xd[n5] ;
-      Real_t xv6 = domain->xd[n6] ;
-      Real_t xv7 = domain->xd[n7] ;
-
-      Real_t yv0 = domain->yd[n0] ;
-      Real_t yv1 = domain->yd[n1] ;
-      Real_t yv2 = domain->yd[n2] ;
-      Real_t yv3 = domain->yd[n3] ;
-      Real_t yv4 = domain->yd[n4] ;
-      Real_t yv5 = domain->yd[n5] ;
-      Real_t yv6 = domain->yd[n6] ;
-      Real_t yv7 = domain->yd[n7] ;
-
-      Real_t zv0 = domain->zd[n0] ;
-      Real_t zv1 = domain->zd[n1] ;
-      Real_t zv2 = domain->zd[n2] ;
-      Real_t zv3 = domain->zd[n3] ;
-      Real_t zv4 = domain->zd[n4] ;
-      Real_t zv5 = domain->zd[n5] ;
-      Real_t zv6 = domain->zd[n6] ;
-      Real_t zv7 = domain->zd[n7] ;
-
-      Real_t vol = domain->volo[i]*domain->vnew[i] ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*((x0+x1+x5+x4) - (x3+x2+x6+x7)) ;
-      Real_t dyj = Real_t(-0.25)*((y0+y1+y5+y4) - (y3+y2+y6+y7)) ;
-      Real_t dzj = Real_t(-0.25)*((z0+z1+z5+z4) - (z3+z2+z6+z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*((x1+x2+x6+x5) - (x0+x3+x7+x4)) ;
-      Real_t dyi = Real_t( 0.25)*((y1+y2+y6+y5) - (y0+y3+y7+y4)) ;
-      Real_t dzi = Real_t( 0.25)*((z1+z2+z6+z5) - (z0+z3+z7+z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*((x4+x5+x6+x7) - (x0+x1+x2+x3)) ;
-      Real_t dyk = Real_t( 0.25)*((y4+y5+y6+y7) - (y0+y1+y2+y3)) ;
-      Real_t dzk = Real_t( 0.25)*((z4+z5+z6+z7) - (z0+z1+z2+z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      domain->delx_zeta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*((xv4+xv5+xv6+xv7) - (xv0+xv1+xv2+xv3)) ;
-      dyv = Real_t(0.25)*((yv4+yv5+yv6+yv7) - (yv0+yv1+yv2+yv3)) ;
-      dzv = Real_t(0.25)*((zv4+zv5+zv6+zv7) - (zv0+zv1+zv2+zv3)) ;
-
-      domain->delv_zeta[i] = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      domain->delx_xi[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*((xv1+xv2+xv6+xv5) - (xv0+xv3+xv7+xv4)) ;
-      dyv = Real_t(0.25)*((yv1+yv2+yv6+yv5) - (yv0+yv3+yv7+yv4)) ;
-      dzv = Real_t(0.25)*((zv1+zv2+zv6+zv5) - (zv0+zv3+zv7+zv4)) ;
-
-      domain->delv_xi[i] = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      domain->delx_eta[i] = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*((xv0+xv1+xv5+xv4) - (xv3+xv2+xv6+xv7)) ;
-      dyv = Real_t(-0.25)*((yv0+yv1+yv5+yv4) - (yv3+yv2+yv6+yv7)) ;
-      dzv = Real_t(-0.25)*((zv0+zv1+zv5+zv4) - (zv3+zv2+zv6+zv7)) ;
-
-      domain->delv_eta[i] = ax*dxv + ay*dyv + az*dzv ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQRegionForElems(Domain* domain, Int_t r,
-                                  Real_t ptiny)
-{
-   Real_t monoq_limiter_mult = domain->monoq_limiter_mult();
-   Real_t monoq_max_slope = domain->monoq_max_slope();
-   Real_t qlc_monoq = domain->qlc_monoq();
-   Real_t qqc_monoq = domain->qqc_monoq();
-
-   RAJA::forall<mat_exec_policy>(domain->getRegionISet(r), [=] (int ielem) {
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Int_t bcMask = domain->elemBC[ielem] ;
-      Real_t delvm = 0.0, delvp =0.0;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / (domain->delv_xi[ielem]+ ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case XI_M_COMM: /* needs comm data */
-         case 0:         delvm = domain->delv_xi[domain->lxim[ielem]]; break ;
-         case XI_M_SYMM: delvm = domain->delv_xi[ielem] ;       break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & XI_P) {
-         case XI_P_COMM: /* needs comm data */
-         case 0:         delvp = domain->delv_xi[domain->lxip[ielem]] ; break ;
-         case XI_P_SYMM: delvp = domain->delv_xi[ielem] ;       break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( domain->delv_eta[ielem] + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case ETA_M_COMM: /* needs comm data */
-         case 0:          delvm = domain->delv_eta[domain->letam[ielem]] ; break ;
-         case ETA_M_SYMM: delvm = domain->delv_eta[ielem] ;        break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & ETA_P) {
-         case ETA_P_COMM: /* needs comm data */
-         case 0:          delvp = domain->delv_eta[domain->letap[ielem]] ; break ;
-         case ETA_P_SYMM: delvp = domain->delv_eta[ielem] ;        break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( domain->delv_zeta[ielem] + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case ZETA_M_COMM: /* needs comm data */
-         case 0:           delvm = domain->delv_zeta[domain->lzetam[ielem]] ; break ;
-         case ZETA_M_SYMM: delvm = domain->delv_zeta[ielem] ;         break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & ZETA_P) {
-         case ZETA_P_COMM: /* needs comm data */
-         case 0:           delvp = domain->delv_zeta[domain->lzetap[ielem]] ; break ;
-         case ZETA_P_SYMM: delvp = domain->delv_zeta[ielem] ;         break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( domain->vdov[ielem] > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = domain->delv_xi[ielem]   * domain->delx_xi[ielem]   ;
-         Real_t delvxeta  = domain->delv_eta[ielem]  * domain->delx_eta[ielem]  ;
-         Real_t delvxzeta = domain->delv_zeta[ielem] * domain->delx_zeta[ielem] ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = domain->elemMass[ielem] / (domain->volo[ielem] * domain->vnew[ielem]) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      domain->qq[ielem] = qquad ;
-      domain->ql[ielem] = qlin  ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQForElems(Domain* domain)
-{  
-   //
-   // initialize parameters
-   // 
-   const Real_t ptiny = Real_t(1.e-36) ;
-
-   //
-   // calculate the monotonic q for all regions
-   //
-   for (Index_t r=0 ; r<domain->numReg() ; ++r) {
-      if (domain->regElemSize(r) > 0) {
-         CalcMonotonicQRegionForElems(domain, r, ptiny) ;
-      }
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcQForElems(Domain* domain)
-{
-   //
-   // MONOTONIC Q option
-   //
-
-   Index_t numElem = domain->numElem() ;
-
-   if (numElem != 0) {
-      Int_t allElem = numElem +  /* local elem */
-            2*domain->sizeX()*domain->sizeY() + /* plane ghosts */
-            2*domain->sizeX()*domain->sizeZ() + /* row ghosts */
-            2*domain->sizeY()*domain->sizeZ() ; /* col ghosts */
-
-      domain->AllocateGradients(numElem, allElem);
-
-#if USE_MPI
-      CommRecv(*domain, MSG_MONOQ, 3,
-               domain->sizeX(), domain->sizeY(), domain->sizeZ(),
-               true, true) ;
-#endif      
-
-      /* Calculate velocity gradients */
-      CalcMonotonicQGradientsForElems(domain);
-
-#if USE_MPI      
-      Domain_member fieldData[3] ;
-      
-      /* Transfer veloctiy gradients in the first order elements */
-      /* problem->commElements->Transfer(CommElements::monoQ) ; */
-
-      fieldData[0] = &Domain::delv_xi ;
-      fieldData[1] = &Domain::delv_eta ;
-      fieldData[2] = &Domain::delv_zeta ;
-
-      CommSend(*domain, MSG_MONOQ, 3, fieldData,
-               domain->sizeX(), domain->sizeY(), domain->sizeZ(),
-               true, true) ;
-
-      CommMonoQ(*domain) ;
-#endif      
-
-      CalcMonotonicQForElems(domain) ;
-
-      // Free up memory
-      domain->DeallocateGradients();
-
-      /* Don't allow excessive artificial viscosity */
-      Index_t idx = -1; 
-      for (Index_t i=0; i<numElem; ++i) {
-         if ( domain->q[i] > domain->qstop() ) {
-            idx = i ;
-            break ;
-         }
-      }
-
-      if(idx >= 0) {
-#if USE_MPI         
-         MPI_Abort(MPI_COMM_WORLD, QStopError) ;
-#else
-         exit(QStopError);
-#endif
-      }
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcPressureForElems(Real_t* p_new, Real_t* bvc,
-                          Real_t* pbvc, Real_t* e_old,
-                          Real_t* compression, Real_t *vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          LULESH_ISET& regISet)
-{
-   RAJA::forall<mat_exec_policy>(regISet, [=] (int ielem) {
-      Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
-      bvc[ielem] = c1s * (compression[ielem] + Real_t(1.));
-      pbvc[ielem] = c1s;
-   } );
-
-   RAJA::forall<mat_exec_policy>(regISet, [=] (int ielem) {
-      p_new[ielem] = bvc[ielem] * e_old[ielem] ;
-
-      if    (FABS(p_new[ielem]) <  p_cut   )
-         p_new[ielem] = Real_t(0.0) ;
-
-      if    ( vnewc[ielem] >= eosvmax ) /* impossible condition here? */
-         p_new[ielem] = Real_t(0.0) ;
-
-      if    (p_new[ielem]       <  pmin)
-         p_new[ielem]   = pmin ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcEnergyForElems(Domain* domain,
-                        Real_t* p_new, Real_t* e_new, Real_t* q_new,
-                        Real_t* bvc, Real_t* pbvc,
-                        Real_t* p_old,
-                        Real_t* compression, Real_t* compHalfStep,
-                        Real_t* vnewc, Real_t* work, Real_t *pHalfStep,
-                        Real_t pmin, Real_t p_cut, Real_t  e_cut,
-                        Real_t q_cut, Real_t emin,
-                        Real_t rho0,
-                        Real_t eosvmax,
-                        LULESH_ISET& regISet)
-{
-   RAJA::forall<mat_exec_policy>(regISet, [=] (int ielem) {  
-      e_new[ielem] = domain->e[ielem]
-         - Real_t(0.5) * domain->delv[ielem] * (p_old[ielem] + domain->q[ielem])
-         + Real_t(0.5) * work[ielem];
-
-      if (e_new[ielem]  < emin ) {
-         e_new[ielem] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                        pmin, p_cut, eosvmax, 
-                        regISet);
-
-   RAJA::forall<mat_exec_policy>(regISet, [=] (int ielem) {  
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[ielem]) ;
-
-      if ( domain->delv[ielem] > Real_t(0.) ) {
-         q_new[ielem] /* = domain->qq[ielem] = domain->ql[ielem] */ = Real_t(0.);
-      }
-      else {
-         Real_t ssc = ( pbvc[ielem] * e_new[ielem]
-                 + vhalf * vhalf * bvc[ielem] * pHalfStep[ielem] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[ielem] = (ssc*domain->ql[ielem] + domain->qq[ielem]) ;
-      }
-
-      e_new[ielem] = e_new[ielem] + Real_t(0.5) * domain->delv[ielem]
-         * (  Real_t(3.0)*(p_old[ielem]     + domain->q[ielem])
-              - Real_t(4.0)*(pHalfStep[ielem] + q_new[ielem])) ;
-   } );
-
-   RAJA::forall<mat_exec_policy>(regISet, [=] (int ielem) {  
-      e_new[ielem] += Real_t(0.5) * work[ielem];
-
-      if (FABS(e_new[ielem]) < e_cut) {
-         e_new[ielem] = Real_t(0.)  ;
-      }
-      if (     e_new[ielem]  < emin ) {
-         e_new[ielem] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                        pmin, p_cut, eosvmax, 
-                        regISet);
-
-   RAJA::forall<mat_exec_policy>(regISet, [=] (int ielem) {  
-      const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-      Real_t q_tilde ;
-
-      if (domain->delv[ielem] > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[ielem] * e_new[ielem]
-                 + vnewc[ielem] * vnewc[ielem] * bvc[ielem] * p_new[ielem] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*domain->ql[ielem] + domain->qq[ielem]) ;
-      }
-
-      e_new[ielem] -= (  Real_t(7.0)*(p_old[ielem]     + domain->q[ielem])
-                       - Real_t(8.0)*(pHalfStep[ielem] + q_new[ielem])
-                       + (p_new[ielem] + q_tilde)) * domain->delv[ielem]*sixth ;
-
-      if (FABS(e_new[ielem]) < e_cut) {
-         e_new[ielem] = Real_t(0.)  ;
-      }
-      if (     e_new[ielem]  < emin ) {
-         e_new[ielem] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                        pmin, p_cut, eosvmax, 
-                        regISet);
-
-   RAJA::forall<mat_exec_policy>(regISet, [=] (int ielem) {
-      if ( domain->delv[ielem] <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[ielem] * e_new[ielem]
-            + vnewc[ielem] * vnewc[ielem] * bvc[ielem] * p_new[ielem] ) / rho0;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[ielem] = (ssc*domain->ql[ielem] + domain->qq[ielem]) ;
-
-         if (FABS(q_new[ielem]) < q_cut) q_new[ielem] = Real_t(0.) ;
-      }
-   } );
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcSoundSpeedForElems(Domain* domain,
-                            Real_t *vnewc, Real_t rho0, Real_t *enewc,
-                            Real_t *pnewc, Real_t *pbvc,
-                            Real_t *bvc, Real_t ss4o3,
-                            LULESH_ISET& regISet)
-{
-   RAJA::forall<mat_exec_policy>(regISet, [=] (int ielem) {
-      Real_t ssTmp = (pbvc[ielem] * enewc[ielem] + vnewc[ielem] * vnewc[ielem] *
-                 bvc[ielem] * pnewc[ielem]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp);
-      }
-      domain->ss[ielem] = ssTmp ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void EvalEOSForElems(Domain* domain,
-                     Real_t *vnewc, Real_t *p_old,
-                     Real_t *compression, Real_t *compHalfStep,
-                     Real_t *work, Real_t *p_new, Real_t *e_new,
-                     Real_t *q_new, Real_t *bvc, Real_t *pbvc,
-                     Real_t *pHalfStep, Int_t reg_num, Int_t rep)
-{
-   Real_t  e_cut = domain->e_cut() ;
-   Real_t  p_cut = domain->p_cut() ;
-   Real_t  ss4o3 = domain->ss4o3() ;
-   Real_t  q_cut = domain->q_cut() ;
-
-   Real_t eosvmax = domain->eosvmax() ;
-   Real_t eosvmin = domain->eosvmin() ;
-   Real_t pmin    = domain->pmin() ;
-   Real_t emin    = domain->emin() ;
-   Real_t rho0    = domain->refdens() ;
-
-   LULESH_ISET& regISet = domain->getRegionISet(reg_num);
-   Int_t numElemReg = regISet.getLength();
- 
-   //loop to add load imbalance based on region number 
-   for(Int_t j = 0; j < rep; j++) {
-      /* compress data, minimal set */
-      RAJA::forall<mat_exec_policy>(regISet, [=] (Index_t ielem) {
-         p_old[ielem] = domain->p[ielem] ;
-         work[ielem] = Real_t(0.0) ;
-      } );
-
-      RAJA::forall<mat_exec_policy>(regISet, [=] (Index_t ielem) {
-         Real_t vchalf ;
-         compression[ielem] = Real_t(1.) / vnewc[ielem] - Real_t(1.);
-         vchalf = vnewc[ielem] - domain->delv[ielem] * Real_t(.5);
-         compHalfStep[ielem] = Real_t(1.) / vchalf - Real_t(1.);
-      } );
-
-      /* Check for v > eosvmax or v < eosvmin */
-      if ( eosvmin != Real_t(0.) ) {
-         RAJA::forall<mat_exec_policy>(regISet, [=] (Index_t ielem) {
-            if (vnewc[ielem] <= eosvmin) { /* impossible due to calling func? */
-               compHalfStep[ielem] = compression[ielem] ;
-            }
-         } );
-      }
-
-      if ( eosvmax != Real_t(0.) ) {
-         RAJA::forall<mat_exec_policy>(regISet, [=] (Index_t ielem) {
-            if (vnewc[ielem] >= eosvmax) { /* impossible due to calling func? */
-               p_old[ielem]        = Real_t(0.) ;
-               compression[ielem]  = Real_t(0.) ;
-               compHalfStep[ielem] = Real_t(0.) ;
-            }
-         } );
-      }
-
-      CalcEnergyForElems(domain, p_new, e_new, q_new, bvc, pbvc,
-                         p_old, compression, compHalfStep,
-                         vnewc, work, pHalfStep, pmin,
-                         p_cut, e_cut, q_cut, emin,
-                         rho0, eosvmax,
-                         regISet);
-   }
-
-   RAJA::forall<mat_exec_policy>(regISet, [=] (Index_t ielem) {
-      domain->p[ielem] = p_new[ielem] ;
-      domain->e[ielem] = e_new[ielem] ;
-      domain->q[ielem] = q_new[ielem] ;
-   } );
-
-   CalcSoundSpeedForElems(domain,
-                          vnewc, rho0, e_new, p_new,
-                          pbvc, bvc, ss4o3,
-                          regISet) ;
-
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void ApplyMaterialPropertiesForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-
-  if (numElem != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = domain->eosvmin() ;
-    Real_t eosvmax = domain->eosvmax() ;
-    Real_t *vnewc = Allocate<Real_t>(numElem) ;
-    Real_t *p_old = Allocate<Real_t>(numElem) ;
-    Real_t *compression = Allocate<Real_t>(numElem) ;
-    Real_t *compHalfStep = Allocate<Real_t>(numElem) ;
-    Real_t *work = Allocate<Real_t>(numElem) ;
-    Real_t *p_new = Allocate<Real_t>(numElem) ;
-    Real_t *e_new = Allocate<Real_t>(numElem) ;
-    Real_t *q_new = Allocate<Real_t>(numElem) ;
-    Real_t *bvc = Allocate<Real_t>(numElem) ;
-    Real_t *pbvc = Allocate<Real_t>(numElem) ;
-    Real_t *pHalfStep = Allocate<Real_t>(numElem) ;
-
-
-    RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int i) {
-       vnewc[i] = domain->vnew[i] ;
-    } );
-
-    // Bound the updated relative volumes with eosvmin/max
-    if (eosvmin != Real_t(0.)) {
-       RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int i) {
-          if (vnewc[i] < eosvmin)
-             vnewc[i] = eosvmin ;
-       } );
-    }
-
-    if (eosvmax != Real_t(0.)) {
-       RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int i) {
-          if (vnewc[i] > eosvmax)
-             vnewc[i] = eosvmax ;
-       } );
-    }
-
-    // This check may not make perfect sense in LULESH, but
-    // it's representative of something in the full code -
-    // just leave it in, please
-    RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int i) {
-       Real_t vc = domain->v[i] ;
-       if (eosvmin != Real_t(0.)) {
-          if (vc < eosvmin)
-             vc = -1.0 ;
-       }
-       if (eosvmax != Real_t(0.)) {
-          if (vc > eosvmax)
-             vc = -1.0 ;
-       }
-       if (vc <= 0.) {
-#if USE_MPI             
-          MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-          exit(VolumeError);
-#endif
-       }
-    } );
-
-    for (Int_t reg_num=0 ; reg_num < domain->numReg() ; reg_num++) {
-       Int_t rep;
-       //Determine load imbalance for this region
-       //round down the number with lowest cost
-       if(reg_num < domain->numReg()/2)
-	 rep = 1;
-       //you don't get an expensive region unless you at least have 5 regions
-       else if(reg_num < (domain->numReg() - (domain->numReg()+15)/20))
-         rep = 1 + domain->cost();
-       //very expensive regions
-       else
-	 rep = 10 * (1+ domain->cost());
-       EvalEOSForElems(domain, vnewc, p_old, compression, compHalfStep,
-                       work, p_new, e_new, q_new, bvc, pbvc, pHalfStep,
-                       reg_num, rep);
-    }
-
-    Release(&pHalfStep) ;
-    Release(&pbvc) ;
-    Release(&bvc) ;
-    Release(&q_new) ;
-    Release(&e_new) ;
-    Release(&p_new) ;
-    Release(&work) ;
-    Release(&compHalfStep) ;
-    Release(&compression) ;
-    Release(&p_old) ;
-    Release(&vnewc) ;
-  }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void UpdateVolumesForElems(Domain* domain, 
-                           Real_t v_cut)
-{
-   RAJA::forall<elem_exec_policy>(domain->getElemISet(), [=] (int i) { 
-      Real_t tmpV = domain->vnew[i] ;
-
-      if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-         tmpV = Real_t(1.0) ;
-
-      domain->v[i] = tmpV ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeElements(Domain* domain, Index_t numElem)
-{
-  CalcLagrangeElements(domain) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems(domain) ;
-
-  ApplyMaterialPropertiesForElems(domain) ;
-
-  UpdateVolumesForElems(domain,
-                        domain->v_cut()) ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcCourantConstraintForElems(Domain* domain, int reg_num,
-                                   Real_t qqc, Real_t& dtcourant)
-{
-   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
-
-   RAJA::ReduceMin<reduce_policy, Real_t> dtcourantLoc(dtcourant) ;
-
-   RAJA::forall<mat_exec_policy>(domain->getRegionISet(reg_num), [=] (int indx) {
-
-      Real_t dtf = domain->ss[indx] * domain->ss[indx] ;
-
-      if ( domain->vdov[indx] < Real_t(0.) ) {
-         dtf += qqc2 * domain->arealg[indx] * domain->arealg[indx] *
-                domain->vdov[indx] * domain->vdov[indx] ;
-      }
-
-      Real_t dtf_cmp = (domain->vdov[indx] != Real_t(0.))
-                     ?  domain->arealg[indx] / SQRT(dtf) : Real_t(1.0e+20) ;
-
-      /* determine minimum timestep with its corresponding elem */
-      dtcourantLoc.min(dtf_cmp) ;
-   } ) ;
-
-   /* Don't try to register a time constraint if none of the elements
-    * were active */
-   if (dtcourantLoc < Real_t(1.0e+20)) {
-      dtcourant = dtcourantLoc ;
-   }
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcHydroConstraintForElems(Domain* domain, int reg_num,
-                                 Real_t dvovmax, Real_t& dthydro)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dthydroLoc(dthydro) ;
-
-   RAJA::forall<mat_exec_policy>(domain->getRegionISet(reg_num), [=] (int indx) {
-
-       Real_t dtvov_cmp = (domain->vdov[indx] != Real_t(0.))
-                        ? (dvovmax / (FABS(domain->vdov[indx])+Real_t(1.e-20)))
-                        : Real_t(1.0e+20) ;
-
-      dthydroLoc.min(dtvov_cmp) ;
-
-   } ) ;
-
-   if (dthydroLoc < Real_t(1.0e+20)) {
-      dthydro = dthydroLoc ;
-   }
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcTimeConstraintsForElems(Domain* domain) {
-
-   // Initialize conditions to a very large value
-   domain->dtcourant() = 1.0e+20;
-   domain->dthydro() = 1.0e+20;
-
-   for (Index_t reg_num=0 ; reg_num < domain->numReg() ; ++reg_num) {
-      /* evaluate time constraint */
-      CalcCourantConstraintForElems(domain, reg_num,
-                                    domain->qqc(),
-                                    domain->dtcourant()) ;
-
-      /* check hydro constraint */
-      CalcHydroConstraintForElems(domain, reg_num,
-                                  domain->dvovmax(),
-                                  domain->dthydro()) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeLeapFrog(Domain* domain)
-{
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   Domain_member fieldData[6] ;
-#endif
-
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal(domain);
-
-
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-#endif
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements(domain, domain->numElem());
-
-#if USE_MPI   
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   CommRecv(*domain, MSG_SYNC_POS_VEL, 6,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ; 
-
-   fieldData[0] = &Domain::x ;
-   fieldData[1] = &Domain::y ;
-   fieldData[2] = &Domain::z ;
-   fieldData[3] = &Domain::xd ;
-   fieldData[4] = &Domain::yd ;
-   fieldData[5] = &Domain::zd ;
-   
-   CommSend(*domain, MSG_SYNC_POS_VEL, 6, fieldData,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-#endif
-#endif   
-
-   CalcTimeConstraintsForElems(domain);
-
-#if USE_MPI   
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   CommSyncPosVel(*domain) ;
-#endif
-#endif   
-}
-
-
-/******************************************/
-
-int main(int argc, char *argv[])
-{
-   Domain *locDom ;
-   Int_t numRanks ;
-   Int_t myRank ;
-   struct cmdLineOpts opts;
-
-#if USE_MPI   
-   Domain_member fieldData ;
-
-   MPI_Init(&argc, &argv) ;
-   MPI_Comm_size(MPI_COMM_WORLD, &numRanks) ;
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-#else
-   numRanks = 1;
-   myRank = 0;
-#endif   
-
-   /* Set defaults that can be overridden by command line opts */
-   opts.its = 9999999;
-   opts.nx  = 30;
-   opts.numReg = 11;
-   opts.numFiles = (int)(numRanks+10)/9;
-   opts.showProg = 0;
-   opts.quiet = 0;
-   opts.viz = 0;
-   opts.balance = 1;
-   opts.cost = 1;
-
-   ParseCommandLineOptions(argc, argv, myRank, &opts);
-
-   if ((myRank == 0) && (opts.quiet == 0)) {
-      printf("Running problem size %d^3 per domain until completion\n", opts.nx);
-      printf("Num processors: %d\n", numRanks);
-#if USE_OMP
-      printf("Num threads: %d\n", omp_get_max_threads());
-#endif
-      printf("Total number of elements: %lld\n\n", (long long int)(numRanks*opts.nx*opts.nx*opts.nx));
-      printf("To run other sizes, use -s <integer>.\n");
-      printf("To run a fixed number of iterations, use -i <integer>.\n");
-      printf("To run a more or less balanced region set, use -b <integer>.\n");
-      printf("To change the relative costs of regions, use -c <integer>.\n");
-      printf("To print out progress, use -p\n");
-      printf("To write an output file for VisIt, use -v\n");
-      printf("See help (-h) for more options\n\n");
-   }
-
-   // Set up the mesh and decompose. Assumes regular cubes for now
-   Int_t col, row, plane, side;
-   InitMeshDecomp(numRanks, myRank, &col, &row, &plane, &side);
-
-   // Build the main data structure and initialize it
-   locDom = new Domain(numRanks, col, row, plane, opts.nx,
-                       side, opts.numReg, opts.balance, opts.cost) ;
-
-
-#if USE_MPI   
-   fieldData = &Domain::nodalMass ;
-
-   // Initial domain boundary communication 
-   CommRecv(*locDom, MSG_COMM_SBN, 1,
-            locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() + 1,
-            true, false) ;
-   CommSend(*locDom, MSG_COMM_SBN, 1, &fieldData,
-            locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() +  1,
-            true, false) ;
-   CommSBN(*locDom, 1, &fieldData) ;
-
-   // End initialization
-   MPI_Barrier(MPI_COMM_WORLD);
-#endif   
-   
-   // BEGIN timestep to solution */
-#ifdef RAJA_USE_CALIPER
-   RAJA::Timer timer_main; 
-   timer_main.start("timer_main");
-#else
-#if USE_MPI   
-   double start = MPI_Wtime();
-#else
-   timeval start;
-   gettimeofday(&start, NULL) ;
-#endif
-#endif
-//debug to see region sizes
-// for(Int_t i = 0; i < locDom->numReg(); i++) {
-//    std::cout << "region " << i + 1<< " size = " << locDom->regElemSize(i) << std::endl;
-//    RAJA::forall<mat_exec_policy>(locDom->getRegionISet(i), [=] (int idx) { printf("%d ", idx) ; }) ;
-//    printf("\n\n") ;
-// }
-   while((locDom->time() < locDom->stoptime()) && (locDom->cycle() < opts.its)) {
-
-      TimeIncrement(*locDom) ;
-      LagrangeLeapFrog(locDom) ;
-
-      if ((opts.showProg != 0) && (opts.quiet == 0) && (myRank == 0)) {
-         printf("cycle = %d, time = %e, dt=%e\n",
-                locDom->cycle(), double(locDom->time()), double(locDom->deltatime()) ) ;
-      }
-   }
-
-   // Use reduced max elapsed time
-   double elapsed_time;
-#ifdef RAJA_USE_CALIPER
-   // Use reduced max elapsed time
-   timer_main.stop("timer_main");
-   elapsed_time = timer_main.elapsed();
-#else
-#if USE_MPI   
-   elapsed_time = MPI_Wtime() - start;
-#else
-   timeval end;
-   gettimeofday(&end, NULL) ;
-   elapsed_time = (double)(end.tv_sec - start.tv_sec) + ((double)(end.tv_usec - start.tv_usec))/1000000 ;
-#endif
-#endif
-   double elapsed_timeG;
-#if USE_MPI   
-   MPI_Reduce(&elapsed_time, &elapsed_timeG, 1, MPI_DOUBLE,
-              MPI_MAX, 0, MPI_COMM_WORLD);
-#else
-   elapsed_timeG = elapsed_time;
-#endif
-
-   // Write out final viz file */
-   if (opts.viz) {
-      DumpToVisit(*locDom, opts.numFiles, myRank, numRanks) ;
-   }
-   
-   if ((myRank == 0) && (opts.quiet == 0)) {
-      VerifyAndWriteFinalOutput(elapsed_timeG, *locDom, opts.nx, numRanks);
-   }
-
-   delete locDom;
-
-#if USE_MPI
-   MPI_Finalize() ;
-#endif
-
-   return 0 ;
-}
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh.h
deleted file mode 100644
index 281a0a399..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh.h
+++ /dev/null
@@ -1,41 +0,0 @@
-
-#include "RAJA/RAJA.hxx"
-
-//
-//   Policies for hybrid segment iteration and segment execution.
-//
-//   NOTE: Currently, we apply single policy across all loops
-//         with same iteration pattern.
-//
-typedef RAJA::seq_segit              IndexSet_SegIt;
-//typedef RAJA::omp_parallel_for_segit IndexSet_SegIt;
-//typedef RAJA::cilk_for_segit         IndexSet_SegIt;
-
-
-//typedef RAJA::seq_exec              SegExec;
-//typedef RAJA::simd_exec             SegExec;
-typedef RAJA::omp_parallel_for_exec SegExec;
-//typedef RAJA::cilk_for_exec         SegExec;
-
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_SegIt, SegExec> node_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_SegIt, SegExec> elem_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_SegIt, SegExec> mat_exec_policy;
-//typedef RAJA::IndexSet::ExecPolicy<IndexSet_SegIt, RAJA::seq_exec> mat_exec_policy;
-// typedef RAJA::IndexSet::ExecPolicy<IndexSet_SegIt, SegExec> minloc_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_SegIt, SegExec> min_exec_policy;
-typedef RAJA::IndexSet::ExecPolicy<IndexSet_SegIt, SegExec> symnode_exec_policy;
-
-//typedef RAJA::seq_reduce              reduce_policy;
-typedef RAJA::omp_reduce              reduce_policy;
-//typedef RAJA::cilk_reduce              reduce_policy;
-
-
-#if !defined(LULESH_HEADER)
-#include "lulesh_stl.h"
-#elif (LULESH_HEADER == 1)
-#include "lulesh_ptr.h"
-#elif (LULESH_HEADER == 2)
-#include "lulesh_raw.h"
-#else
-#include "lulesh_tuple.h"
-#endif
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_ptr.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_ptr.h
deleted file mode 100644
index 8fb69ddb4..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_ptr.h
+++ /dev/null
@@ -1,692 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-
-// OpenMP will be compiled in if this flag is set to 1 AND the compiler beging
-// used supports it (i.e. the _OPENMP symbol is defined)
-//#define USE_OMP 1
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <stdlib.h>
-#include <math.h>
-#include <vector>
-
-#include "RAJA/RAJA.hxx"
-
-//
-//   RAJA IndexSet type used in loop traversals.
-//
-typedef RAJA::IndexSet LULESH_ISET;
-
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-typedef Real_t * __restrict__ Real_p ;
-typedef Index_t * __restrict__ Index_p ;
-typedef Int_t * __restrict__ Int_p ;
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-//////////////////////////////////////////////////////
-// Helper functions
-//////////////////////////////////////////////////////
-
-/* might want to add access methods so that memory can be */
-/* better managed, as in luleshFT */
-
-template <typename T>
-inline T *Allocate(size_t size)
-{
-   return static_cast<T *>(malloc(sizeof(T)*size)) ;
-}
-
-template <typename T>
-inline void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-template <typename T>
-inline void Release(T * __restrict__ *ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Index_t numNode) // Node-centered
-   {
-      m_x = Allocate<Real_t>(numNode) ; // coordinates
-      m_y = Allocate<Real_t>(numNode) ;
-      m_z = Allocate<Real_t>(numNode) ;
-
-      m_xd = Allocate<Real_t>(numNode) ; // velocities
-      m_yd = Allocate<Real_t>(numNode) ;
-      m_zd = Allocate<Real_t>(numNode) ;
-
-      m_xdd = Allocate<Real_t>(numNode) ; // accelerations
-      m_ydd = Allocate<Real_t>(numNode) ;
-      m_zdd = Allocate<Real_t>(numNode) ;
-
-      m_fx = Allocate<Real_t>(numNode) ; // forces
-      m_fy = Allocate<Real_t>(numNode) ;
-      m_fz = Allocate<Real_t>(numNode) ;
-
-      m_nodalMass = Allocate<Real_t>(numNode) ; // mass
-   }
-
-   void AllocateElemPersistent(Index_t numElem) // Elem-centered
-   {
-      m_nodelist = Allocate<Index_t>(8*numElem) ;
-
-      // elem connectivities through face
-      m_lxim = Allocate<Index_t>(numElem) ;
-      m_lxip = Allocate<Index_t>(numElem) ;
-      m_letam = Allocate<Index_t>(numElem) ;
-      m_letap = Allocate<Index_t>(numElem) ;
-      m_lzetam = Allocate<Index_t>(numElem) ;
-      m_lzetap = Allocate<Index_t>(numElem) ;
-
-      m_elemBC = Allocate<Int_t>(numElem) ;
-
-      m_e = Allocate<Real_t>(numElem) ;
-      m_p = Allocate<Real_t>(numElem) ;
-
-      m_q = Allocate<Real_t>(numElem) ;
-      m_ql = Allocate<Real_t>(numElem) ;
-      m_qq = Allocate<Real_t>(numElem) ;
-
-      m_v = Allocate<Real_t>(numElem) ;
-
-      m_volo = Allocate<Real_t>(numElem) ;
-      m_delv = Allocate<Real_t>(numElem) ;
-      m_vdov = Allocate<Real_t>(numElem) ;
-
-      m_arealg = Allocate<Real_t>(numElem) ;
-
-      m_ss = Allocate<Real_t>(numElem) ;
-
-      m_elemMass = Allocate<Real_t>(numElem) ;
-
-      m_vnew = Allocate<Real_t>(numElem) ;
-   }
-
-   void AllocateGradients(Index_t numElem, Index_t allElem)
-   {
-      // Position gradients
-      m_delx_xi = Allocate<Real_t>(numElem) ;
-      m_delx_eta = Allocate<Real_t>(numElem) ;
-      m_delx_zeta = Allocate<Real_t>(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi = Allocate<Real_t>(allElem) ;
-      m_delv_eta = Allocate<Real_t>(allElem) ;
-      m_delv_zeta = Allocate<Real_t>(allElem) ;
-   }
-
-   void DeallocateGradients()
-   {
-      Release(&m_delv_zeta) ;
-      Release(&m_delv_eta) ;
-      Release(&m_delv_xi) ;
-
-      Release(&m_delx_zeta) ;
-      Release(&m_delx_eta) ;
-      Release(&m_delx_xi) ;
-   }
-
-   void AllocateStrains(Index_t numElem)
-   {
-      m_dxx = Allocate<Real_t>(numElem) ;
-      m_dyy = Allocate<Real_t>(numElem) ;
-      m_dzz = Allocate<Real_t>(numElem) ;
-   }
-
-   void DeallocateStrains()
-   {
-      Release(&m_dzz) ;
-      Release(&m_dyy) ;
-      Release(&m_dxx) ;
-   }
-
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_x[idx] ; }
-   Real_t& y(Index_t idx)    { return m_y[idx] ; }
-   Real_t& z(Index_t idx)    { return m_z[idx] ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
-   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
-   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
-   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
-   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
-   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
-   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   //
-   // Element-centered
-   //
-   Index_p  nodelist(Index_t idx) { return &m_nodelist[Index_t(8)*idx] ; }
-
-#if !defined(LULESH_LIST_INDEXSET)
-   Index_t&  perm(Index_t idx)     { return m_perm[idx] ; }
-#else
-   Index_t  perm(Index_t idx)     { return idx ; }
-#endif
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
-   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
-   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
-   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
-   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
-   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_p[idx] ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_q[idx] ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_v[idx] ; }
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_p nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-
-   // Region Centered
-
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_p   regNumList()            { return &m_regNumList[0] ; }
-   Index_p   regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-
-   //
-   // Accessors for index sets
-   //
-   LULESH_ISET& getNodeISet()  { return m_domNodeISet ; }
-   LULESH_ISET& getElemISet()  { return m_domElemISet ; }
-
-   LULESH_ISET& getRegionISet(int r) { return m_domRegISet[r] ; }
-
-   LULESH_ISET& getXSymNodeISet() { return m_domXSymNodeISet ; }
-   LULESH_ISET& getYSymNodeISet() { return m_domYSymNodeISet ; }
-   LULESH_ISET& getZSymNodeISet() { return m_domZSymNodeISet ; }
-
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_p commDataSend ;
-   Real_p commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMeshTopology(Index_t edgeNodes, Index_t edgeElems);
-   void BuildMeshCoordinates(Index_t nx, Index_t edgeNodes);
-   void SetupThreadSupportStructures();
-   void CreateMeshIndexSets();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void CreateSymmetryIndexSets(Index_t edgeNodes);
-   void SetupCommBuffers(Index_t edgeNodes);
-   void SetupElementConnectivities(Index_t edgeElems);
-   void SetupBoundaryConditions(Index_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* mesh-based index sets */
-   LULESH_ISET m_domNodeISet ;
-   LULESH_ISET m_domElemISet ;
-
-   LULESH_ISET m_domXSymNodeISet ;
-   LULESH_ISET m_domYSymNodeISet ;
-   LULESH_ISET m_domZSymNodeISet ;
-
-   /* region-based index sets */
-   std::vector<LULESH_ISET> m_domRegISet;
-
-   /* Node-centered */
-   Real_p m_x ;  /* coordinates */
-   Real_p m_y ;
-   Real_p m_z ;
-
-   Real_p m_xd ; /* velocities */
-   Real_p m_yd ;
-   Real_p m_zd ;
-
-   Real_p m_xdd ; /* accelerations */
-   Real_p m_ydd ;
-   Real_p m_zdd ;
-
-   Real_p m_fx ;  /* forces */
-   Real_p m_fy ;
-   Real_p m_fz ;
-
-   Real_p m_nodalMass ;  /* mass */
-
-   // Element-centered
-
-   Index_p  m_nodelist ;     /* elemToNode connectivity */
-
-   Index_p  m_lxim ;  /* element connectivity across each face */
-   Index_p  m_lxip ;
-   Index_p  m_letam ;
-   Index_p  m_letap ;
-   Index_p  m_lzetam ;
-   Index_p  m_lzetap ;
-
-   Int_p    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   Real_p m_dxx ;  /* principal strains -- temporary */
-   Real_p m_dyy ;
-   Real_p m_dzz ;
-
-   Real_p m_delv_xi ;    /* velocity gradient -- temporary */
-   Real_p m_delv_eta ;
-   Real_p m_delv_zeta ;
-
-   Real_p m_delx_xi ;    /* coordinate gradient -- temporary */
-   Real_p m_delx_eta ;
-   Real_p m_delx_zeta ;
-
-   Real_p m_e ;   /* energy */
-
-   Real_p m_p ;   /* pressure */
-   Real_p m_q ;   /* q */
-   Real_p m_ql ;  /* linear term for q */
-   Real_p m_qq ;  /* quadratic term for q */
-
-   Real_p m_v ;     /* relative volume */
-   Real_p m_volo ;  /* reference volume */
-   Real_p m_vnew ;  /* new relative volume -- temporary */
-   Real_p m_delv ;  /* m_vnew - m_v */
-   Real_p m_vdov ;  /* volume derivative over volume */
-
-   Real_p m_arealg ;  /* characteristic length of an element */
-
-   Real_p m_ss ;      /* "sound speed" */
-
-   Real_p m_elemMass ;  /* mass */
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_p m_regElemSize ;   // Size of region sets
-   Index_p m_regNumList ;    // Region number per domain element
-   Index_p *m_regElemlist ;  // region indexset
-
-   // Permutation to pack element-centered material subsets
-   // into a contiguous range per material
-   Index_p m_perm ;
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-   // OMP hack 
-   Index_p m_nodeElemStart ;
-   Index_p m_nodeElemCornerList ;
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_raw.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_raw.h
deleted file mode 100644
index cb1568bbf..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_raw.h
+++ /dev/null
@@ -1,590 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-
-// OpenMP will be compiled in if this flag is set to 1 AND the compiler beging
-// used supports it (i.e. the _OPENMP symbol is defined)
-//#define USE_OMP 1
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <stdlib.h>
-#include <math.h>
-#include <vector>
-
-#include "RAJA/RAJA.hxx"
-
-//
-//   RAJA IndexSet type used in loop traversals.
-//
-typedef RAJA::IndexSet LULESH_ISET;
-
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-typedef Real_t * __restrict__ Real_p ;
-typedef Index_t * __restrict__ Index_p ;
-typedef Int_t * __restrict__ Int_p ;
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-//////////////////////////////////////////////////////
-// Helper functions
-//////////////////////////////////////////////////////
-
-/* might want to add access methods so that memory can be */
-/* better managed, as in luleshFT */
-
-template <typename T>
-inline T *Allocate(size_t size)
-{
-   return static_cast<T *>(malloc(sizeof(T)*size)) ;
-}
-
-template <typename T>
-inline void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-template <typename T>
-inline void Release(T * __restrict__ *ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Index_t numNode) // Node-centered
-   {
-      x = Allocate<Real_t>(numNode) ; // coordinates
-      y = Allocate<Real_t>(numNode) ;
-      z = Allocate<Real_t>(numNode) ;
-
-      xd = Allocate<Real_t>(numNode) ; // velocities
-      yd = Allocate<Real_t>(numNode) ;
-      zd = Allocate<Real_t>(numNode) ;
-
-      xdd = Allocate<Real_t>(numNode) ; // accelerations
-      ydd = Allocate<Real_t>(numNode) ;
-      zdd = Allocate<Real_t>(numNode) ;
-
-      fx = Allocate<Real_t>(numNode) ; // forces
-      fy = Allocate<Real_t>(numNode) ;
-      fz = Allocate<Real_t>(numNode) ;
-
-      nodalMass = Allocate<Real_t>(numNode) ; // mass
-   }
-
-   void AllocateElemPersistent(Index_t numElem) // Elem-centered
-   {
-      nodelist = Allocate<Index_t>(8*numElem) ;
-
-      // elem connectivities through face
-      lxim = Allocate<Index_t>(numElem) ;
-      lxip = Allocate<Index_t>(numElem) ;
-      letam = Allocate<Index_t>(numElem) ;
-      letap = Allocate<Index_t>(numElem) ;
-      lzetam = Allocate<Index_t>(numElem) ;
-      lzetap = Allocate<Index_t>(numElem) ;
-
-      elemBC = Allocate<Int_t>(numElem) ;
-
-      e = Allocate<Real_t>(numElem) ;
-      p = Allocate<Real_t>(numElem) ;
-
-      q = Allocate<Real_t>(numElem) ;
-      ql = Allocate<Real_t>(numElem) ;
-      qq = Allocate<Real_t>(numElem) ;
-
-      v = Allocate<Real_t>(numElem) ;
-
-      volo = Allocate<Real_t>(numElem) ;
-      delv = Allocate<Real_t>(numElem) ;
-      vdov = Allocate<Real_t>(numElem) ;
-
-      arealg = Allocate<Real_t>(numElem) ;
-
-      ss = Allocate<Real_t>(numElem) ;
-
-      elemMass = Allocate<Real_t>(numElem) ;
-
-      vnew = Allocate<Real_t>(numElem) ;
-   }
-
-   void AllocateGradients(Index_t numElem, Index_t allElem)
-   {
-      // Position gradients
-      delx_xi = Allocate<Real_t>(numElem) ;
-      delx_eta = Allocate<Real_t>(numElem) ;
-      delx_zeta = Allocate<Real_t>(numElem) ;
-
-      // Velocity gradients
-      delv_xi = Allocate<Real_t>(allElem) ;
-      delv_eta = Allocate<Real_t>(allElem) ;
-      delv_zeta = Allocate<Real_t>(allElem) ;
-   }
-
-   void DeallocateGradients()
-   {
-      Release(&delv_zeta) ;
-      Release(&delv_eta) ;
-      Release(&delv_xi) ;
-
-      Release(&delx_zeta) ;
-      Release(&delx_eta) ;
-      Release(&delx_xi) ;
-   }
-
-   void AllocateStrains(Index_t numElem)
-   {
-      dxx = Allocate<Real_t>(numElem) ;
-      dyy = Allocate<Real_t>(numElem) ;
-      dzz = Allocate<Real_t>(numElem) ;
-   }
-
-   void DeallocateStrains()
-   {
-      Release(&dzz) ;
-      Release(&dyy) ;
-      Release(&dxx) ;
-   }
-
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_p nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-
-   // Region Centered
-
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_p   regNumList()            { return &m_regNumList[0] ; }
-   Index_p   regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-
-   //
-   // Accessors for index sets
-   //
-   LULESH_ISET& getNodeISet()  { return m_domNodeISet ; }
-   LULESH_ISET& getElemISet()  { return m_domElemISet ; }
-
-   LULESH_ISET& getRegionISet(int r) { return m_domRegISet[r] ; }
-
-   LULESH_ISET& getXSymNodeISet() { return m_domXSymNodeISet ; }
-   LULESH_ISET& getYSymNodeISet() { return m_domYSymNodeISet ; }
-   LULESH_ISET& getZSymNodeISet() { return m_domZSymNodeISet ; }
-
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_p commDataSend ;
-   Real_p commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* mesh-based index sets */
-   LULESH_ISET m_domNodeISet ;
-   LULESH_ISET m_domElemISet ;
-
-   LULESH_ISET m_domXSymNodeISet ;
-   LULESH_ISET m_domYSymNodeISet ;
-   LULESH_ISET m_domZSymNodeISet ;
-
-   /* region-based index sets */
-   std::vector<LULESH_ISET> m_domRegISet;
-
-   /* Node-centered */
-   Real_p x ;  /* coordinates */
-   Real_p y ;
-   Real_p z ;
-
-   Real_p xd ; /* velocities */
-   Real_p yd ;
-   Real_p zd ;
-
-   Real_p xdd ; /* accelerations */
-   Real_p ydd ;
-   Real_p zdd ;
-
-   Real_p fx ;  /* forces */
-   Real_p fy ;
-   Real_p fz ;
-
-   Real_p nodalMass ;  /* mass */
-
-   // Element-centered
-
-   Index_p  nodelist ;     /* elemToNode connectivity */
-
-   Index_p  lxim ;  /* element connectivity across each face */
-   Index_p  lxip ;
-   Index_p  letam ;
-   Index_p  letap ;
-   Index_p  lzetam ;
-   Index_p  lzetap ;
-
-   Int_p    elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   Real_p dxx ;  /* principal strains -- temporary */
-   Real_p dyy ;
-   Real_p dzz ;
-
-   Real_p delv_xi ;    /* velocity gradient -- temporary */
-   Real_p delv_eta ;
-   Real_p delv_zeta ;
-
-   Real_p delx_xi ;    /* coordinate gradient -- temporary */
-   Real_p delx_eta ;
-   Real_p delx_zeta ;
-
-   Real_p e ;   /* energy */
-
-   Real_p p ;   /* pressure */
-   Real_p q ;   /* q */
-   Real_p ql ;  /* linear term for q */
-   Real_p qq ;  /* quadratic term for q */
-
-   Real_p v ;     /* relative volume */
-   Real_p volo ;  /* reference volume */
-   Real_p vnew ;  /* new relative volume -- temporary */
-   Real_p delv ;  /* m_vnew - m_v */
-   Real_p vdov ;  /* volume derivative over volume */
-
-   Real_p arealg ;  /* characteristic length of an element */
-
-   Real_p ss ;      /* "sound speed" */
-
-   Real_p elemMass ;  /* mass */
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_p m_regElemSize ;   // Size of region sets
-   Index_p m_regNumList ;    // Region number per domain element
-   Index_p *m_regElemlist ;  // region indexset
-
-   // Permutation to pack element-centered material subsets
-   // into a contiguous range per material
-   Index_p m_perm ;
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-   // OMP hack 
-   Index_p m_nodeElemStart ;
-   Index_p m_nodeElemCornerList ;
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-   private:
-
-   void BuildMeshTopology(Index_t edgeNodes, Index_t edgeElems);
-   void BuildMeshCoordinates(Index_t nx, Index_t edgeNodes);
-   void SetupThreadSupportStructures();
-   void CreateMeshIndexSets();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void CreateSymmetryIndexSets(Index_t edgeNodes);
-   void SetupCommBuffers(Index_t edgeNodes);
-   void SetupElementConnectivities(Index_t edgeElems);
-   void SetupBoundaryConditions(Index_t edgeElems);
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_stl.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_stl.h
deleted file mode 100644
index df15b92f7..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_stl.h
+++ /dev/null
@@ -1,679 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-
-// OpenMP will be compiled in if this flag is set to 1 AND the compiler beging
-// used supports it (i.e. the _OPENMP symbol is defined)
-//#define USE_OMP 1
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <stdlib.h>
-#include <math.h>
-#include <vector>
-
-#include "RAJA/RAJA.hxx"
-
-//
-//   RAJA IndexSet type used in loop traversals.
-//
-typedef RAJA::IndexSet LULESH_ISET;
-
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-//////////////////////////////////////////////////////
-// Helper functions
-//////////////////////////////////////////////////////
-
-/* might want to add access methods so that memory can be */
-/* better managed, as in luleshFT */
-
-template <typename T>
-inline T *Allocate(size_t size)
-{
-   return static_cast<T *>(malloc(sizeof(T)*size)) ;
-}
-
-template <typename T>
-inline void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Int_t numNode) // Node-centered
-   {
-      m_x.reserve(numNode);  // coordinates
-      m_y.reserve(numNode);
-      m_z.reserve(numNode);
-
-      m_xd.reserve(numNode); // velocities
-      m_yd.reserve(numNode);
-      m_zd.reserve(numNode);
-
-      m_xdd.reserve(numNode); // accelerations
-      m_ydd.reserve(numNode);
-      m_zdd.reserve(numNode);
-
-      m_fx.reserve(numNode);  // forces
-      m_fy.reserve(numNode);
-      m_fz.reserve(numNode);
-
-      m_nodalMass.reserve(numNode);  // mass
-   }
-
-   void AllocateElemPersistent(Int_t numElem) // Elem-centered
-   {
-      m_nodelist.reserve(8*numElem);
-
-      // elem connectivities through face
-      m_lxim.reserve(numElem);
-      m_lxip.reserve(numElem);
-      m_letam.reserve(numElem);
-      m_letap.reserve(numElem);
-      m_lzetam.reserve(numElem);
-      m_lzetap.reserve(numElem);
-
-      m_elemBC.reserve(numElem);
-
-      m_e.reserve(numElem);
-      m_p.reserve(numElem);
-
-      m_q.reserve(numElem);
-      m_ql.reserve(numElem);
-      m_qq.reserve(numElem);
-
-      m_v.reserve(numElem);
-
-      m_volo.reserve(numElem);
-      m_delv.reserve(numElem);
-      m_vdov.reserve(numElem);
-
-      m_arealg.reserve(numElem);
-
-      m_ss.reserve(numElem);
-
-      m_elemMass.reserve(numElem);
-
-      m_vnew.reserve(numElem) ;
-   }
-
-   void AllocateGradients(Int_t numElem, Int_t allElem)
-   {
-      // Position gradients
-      m_delx_xi.reserve(numElem) ;
-      m_delx_eta.reserve(numElem) ;
-      m_delx_zeta.reserve(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi.reserve(allElem) ;
-      m_delv_eta.reserve(allElem);
-      m_delv_zeta.reserve(allElem) ;
-   }
-
-   void DeallocateGradients()
-   {
-      m_delx_zeta.clear() ;
-      m_delx_eta.clear() ;
-      m_delx_xi.clear() ;
-
-      m_delv_zeta.clear() ;
-      m_delv_eta.clear() ;
-      m_delv_xi.clear() ;
-   }
-
-   void AllocateStrains(Int_t numElem)
-   {
-      m_dxx.reserve(numElem) ;
-      m_dyy.reserve(numElem) ;
-      m_dzz.reserve(numElem) ;
-   }
-
-   void DeallocateStrains()
-   {
-      m_dzz.clear() ;
-      m_dyy.clear() ;
-      m_dxx.clear() ;
-   }
-   
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_x[idx] ; }
-   Real_t& y(Index_t idx)    { return m_y[idx] ; }
-   Real_t& z(Index_t idx)    { return m_z[idx] ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
-   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
-   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
-   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
-   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
-   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
-   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   //
-   // Element-centered
-   //
-   Index_t*  nodelist(Index_t idx) { return &m_nodelist[Index_t(8)*idx] ; }
-
-#if !defined(LULESH_LIST_INDEXSET)
-   Index_t&  perm(Index_t idx)     { return m_perm[idx] ; }
-#else
-   Index_t  perm(Index_t idx)     { return idx ; }
-#endif
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
-   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
-   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
-   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
-   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
-   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_p[idx] ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_q[idx] ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_v[idx] ; }
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_t *nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-
-   // Region Centered
-
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_t*  regNumList()            { return &m_regNumList[0] ; }
-   Index_t*  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-
-   //
-   // Accessors for index sets
-   //
-   LULESH_ISET& getNodeISet()  { return m_domNodeISet ; }
-   LULESH_ISET& getElemISet()  { return m_domElemISet ; }
-
-   LULESH_ISET& getRegionISet(int r) { return m_domRegISet[r] ; }
-
-   LULESH_ISET& getXSymNodeISet() { return m_domXSymNodeISet ; }
-   LULESH_ISET& getYSymNodeISet() { return m_domYSymNodeISet ; }
-   LULESH_ISET& getZSymNodeISet() { return m_domZSymNodeISet ; }
-
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_t *commDataSend ;
-   Real_t *commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMeshTopology(Index_t edgeNodes, Index_t edgeElems);
-   void BuildMeshCoordinates(Index_t nx, Index_t edgeNodes);
-   void SetupThreadSupportStructures();
-   void CreateMeshIndexSets();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void CreateSymmetryIndexSets(Index_t edgeNodes);
-   void SetupCommBuffers(Index_t edgeNodes);
-   void SetupElementConnectivities(Index_t edgeElems);
-   void SetupBoundaryConditions(Index_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* mesh-based index sets */
-   LULESH_ISET m_domNodeISet ;
-   LULESH_ISET m_domElemISet ;
-
-   LULESH_ISET m_domXSymNodeISet ;
-   LULESH_ISET m_domYSymNodeISet ;
-   LULESH_ISET m_domZSymNodeISet ;
-
-   /* region-based index sets */
-   std::vector<LULESH_ISET> m_domRegISet;
-
-   /* Node-centered */
-   std::vector<Real_t> m_x ;  /* coordinates */
-   std::vector<Real_t> m_y ;
-   std::vector<Real_t> m_z ;
-
-   std::vector<Real_t> m_xd ; /* velocities */
-   std::vector<Real_t> m_yd ;
-   std::vector<Real_t> m_zd ;
-
-   std::vector<Real_t> m_xdd ; /* accelerations */
-   std::vector<Real_t> m_ydd ;
-   std::vector<Real_t> m_zdd ;
-
-   std::vector<Real_t> m_fx ;  /* forces */
-   std::vector<Real_t> m_fy ;
-   std::vector<Real_t> m_fz ;
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   // Element-centered
-
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */
-   std::vector<Index_t>  m_lxip ;
-   std::vector<Index_t>  m_letam ;
-   std::vector<Index_t>  m_letap ;
-   std::vector<Index_t>  m_lzetam ;
-   std::vector<Index_t>  m_lzetap ;
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   std::vector<Real_t> m_p ;   /* pressure */
-   std::vector<Real_t> m_q ;   /* q */
-   std::vector<Real_t> m_ql ;  /* linear term for q */
-   std::vector<Real_t> m_qq ;  /* quadratic term for q */
-
-   std::vector<Real_t> m_v ;     /* relative volume */
-   std::vector<Real_t> m_volo ;  /* reference volume */
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_t *m_regElemSize ;   // Size of region sets
-   Index_t *m_regNumList ;    // Region number per domain element
-   Index_t **m_regElemlist ;  // region indexset 
-
-   // Permutation to pack element-centered material subsets
-   // into a contiguous range per material
-   Index_t *m_perm ;
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-   // OMP hack 
-   Index_t *m_nodeElemStart ;
-   Index_t *m_nodeElemCornerList ;
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_tuple.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_tuple.h
deleted file mode 100644
index 9ca796ac0..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/lulesh_tuple.h
+++ /dev/null
@@ -1,649 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-
-// OpenMP will be compiled in if this flag is set to 1 AND the compiler beging
-// used supports it (i.e. the _OPENMP symbol is defined)
-//#define USE_OMP 1
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <math.h>
-#include <vector>
-
-#include "RAJA/RAJA.hxx"
-
-//
-//   RAJA IndexSet type used in loop traversals.
-//
-typedef RAJA::IndexSet LULESH_ISET;
-
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Int_t numNode) // Node-centered
-   {
-      m_coord.reserve(numNode);  // coordinates
-
-      m_vel.reserve(numNode); // velocities
-
-      m_acc.reserve(numNode); // accelerations
-
-      m_force.reserve(numNode);  // forces
-
-      m_nodalMass.reserve(numNode);  // mass
-   }
-
-   void AllocateElemPersistent(Int_t numElem) // Elem-centered
-   {
-      m_nodelist.reserve(8*numElem);
-
-      // elem connectivities through face
-      m_faceToElem.reserve(numElem);
-
-      m_elemBC.reserve(numElem);
-
-      m_e.reserve(numElem);
-
-      m_pq.reserve(numElem);
-
-      m_qlqq.reserve(numElem);
-
-      m_vol.reserve(numElem);
-
-      m_delv.reserve(numElem);
-      m_vdov.reserve(numElem);
-
-      m_arealg.reserve(numElem);
-
-      m_ss.reserve(numElem);
-
-      m_elemMass.reserve(numElem);
-
-      m_vnew.reserve(numElem) ;
-   }
-
-   void AllocateGradients(Int_t numElem, Int_t allElem)
-   {
-      // Position gradients
-      m_delx_xi.reserve(numElem) ;
-      m_delx_eta.reserve(numElem) ;
-      m_delx_zeta.reserve(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi.reserve(allElem) ;
-      m_delv_eta.reserve(allElem);
-      m_delv_zeta.reserve(allElem) ;
-   }
-
-   void DeallocateGradients()
-   {
-      m_delx_zeta.clear() ;
-      m_delx_eta.clear() ;
-      m_delx_xi.clear() ;
-
-      m_delv_zeta.clear() ;
-      m_delv_eta.clear() ;
-      m_delv_xi.clear() ;
-   }
-
-   void AllocateStrains(Int_t numElem)
-   {
-      m_dxx.reserve(numElem) ;
-      m_dyy.reserve(numElem) ;
-      m_dzz.reserve(numElem) ;
-   }
-
-   void DeallocateStrains()
-   {
-      m_dzz.clear() ;
-      m_dyy.clear() ;
-      m_dxx.clear() ;
-   }
-   
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_coord[idx].x ; }
-   Real_t& y(Index_t idx)    { return m_coord[idx].y ; }
-   Real_t& z(Index_t idx)    { return m_coord[idx].z ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_vel[idx].x ; }
-   Real_t& yd(Index_t idx)   { return m_vel[idx].y ; }
-   Real_t& zd(Index_t idx)   { return m_vel[idx].z ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_acc[idx].x ; }
-   Real_t& ydd(Index_t idx)  { return m_acc[idx].y ; }
-   Real_t& zdd(Index_t idx)  { return m_acc[idx].z ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_force[idx].x ; }
-   Real_t& fy(Index_t idx)   { return m_force[idx].y ; }
-   Real_t& fz(Index_t idx)   { return m_force[idx].z ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   //
-   // Element-centered
-   //
-   Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
-
-#if !defined(LULESH_LIST_INDEXSET)
-   Index_t&  perm(Index_t idx)     { return m_perm[idx] ; }
-#else
-   Index_t  perm(Index_t idx)     { return idx ; }
-#endif
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_faceToElem[idx].lxim ; }
-   Index_t&  lxip(Index_t idx) { return m_faceToElem[idx].lxip ; }
-   Index_t&  letam(Index_t idx) { return m_faceToElem[idx].letam ; }
-   Index_t&  letap(Index_t idx) { return m_faceToElem[idx].letap ; }
-   Index_t&  lzetam(Index_t idx) { return m_faceToElem[idx].lzetam ; }
-   Index_t&  lzetap(Index_t idx) { return m_faceToElem[idx].lzetap ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_pq[idx].p ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_pq[idx].q ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_qlqq[idx].ql ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qlqq[idx].qq ; }
-
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_vol[idx].v ; }
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_vol[idx].volo ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_t *nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-
-   // Region Centered
-
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_t*  regNumList()            { return &m_regNumList[0] ; }
-   Index_t*  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-
-   //
-   // Accessors for index sets
-   //
-   LULESH_ISET& getNodeISet()  { return m_domNodeISet ; }
-   LULESH_ISET& getElemISet()  { return m_domElemISet ; }
-
-   LULESH_ISET& getRegionISet(int r) { return m_domRegISet[r] ; }
-
-   LULESH_ISET& getXSymNodeISet() { return m_domXSymNodeISet ; }
-   LULESH_ISET& getYSymNodeISet() { return m_domYSymNodeISet ; }
-   LULESH_ISET& getZSymNodeISet() { return m_domZSymNodeISet ; }
-   
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_t *commDataSend ;
-   Real_t *commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMeshTopology(Index_t edgeNodes, Index_t edgeElems);
-   void BuildMeshCoordinates(Index_t nx, Index_t edgeNodes);
-   void SetupThreadSupportStructures();
-   void CreateMeshIndexSets();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void CreateSymmetryIndexSets(Index_t edgeNodes);
-   void SetupCommBuffers(Index_t edgeNodes);
-   void SetupElementConnectivities(Index_t edgeElems);
-   void SetupBoundaryConditions(Index_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* mesh-based index sets */
-   LULESH_ISET m_domNodeISet ;
-   LULESH_ISET m_domElemISet ;
-
-   LULESH_ISET m_domXSymNodeISet ;
-   LULESH_ISET m_domYSymNodeISet ;
-   LULESH_ISET m_domZSymNodeISet ;
-
-   /* region-based index sets */
-   std::vector<LULESH_ISET> m_domRegISet;
-
-   /* Node-centered */
-
-   struct Tuple3 {
-      Real_t x, y, z ;
-   } ;
-
-   std::vector<Tuple3> m_coord ;  /* coordinates */
-
-   std::vector<Tuple3> m_vel ; /* velocities */
-
-   std::vector<Tuple3> m_acc ; /* accelerations */
-
-   std::vector<Tuple3> m_force ;  /* forces */
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   // Element-centered
-
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   struct FaceElemConn {
-      Index_t lxim, lxip, letam, letap, lzetam, lzetap ;
-   } ;
-
-   std::vector<FaceElemConn> m_faceToElem ; /* element conn across faces */
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   struct Pcomponents {
-      Real_t p, q ;
-   } ;
-
-   std::vector<Pcomponents> m_pq ;   /* pressure and artificial viscosity */
-
-   struct Qcomponents {
-      Real_t ql, qq ;
-   } ;
-
-   std::vector<Qcomponents> m_qlqq ;  /* linear and quadratic terms for q */
-
-   struct Volume {
-      Real_t v, volo ;
-   } ;
-
-   std::vector<Volume> m_vol ;     /* relative and reference volume */
-
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_t *m_regElemSize ;   // Size of region sets
-   Index_t *m_regNumList ;    // Region number per domain element
-   Index_t **m_regElemlist ;  // region indexset 
-
-   // Permutation to pack element-centered material subsets
-   // into a contiguous range per material
-   Index_t *m_perm ;
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-   // OMP hack 
-   Index_t *m_nodeElemStart ;
-   Index_t *m_nodeElemCornerList ;
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/subs b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/subs
deleted file mode 100644
index b42cc428c..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-MICfriendly/subs
+++ /dev/null
@@ -1,10 +0,0 @@
-%s/()/##/g
-%s/domain->\([A-Za-z0-9]\+\)(\([A-Za-z0-9]\+\))/domain->\1[\2]/g
-%s/domain->\([_A-Za-z0-9]\+\)(\([_A-Za-z0-9]\+\))/domain->\1[\2]/g
-%s/getRegionISet\[\([_A-Za-z0-9]\+\)\]/getRegionISet(\1)/g
-%s/nodelist\[/nodelist\[8*/g
-# search for AllocateStrains
-%s/##/()/g
-
-%s/domain\.\([A-Za-z0-9]\+\)(\([A-Za-z0-9]\+\))/domain\.\1[\2]/g
-%s/domain\.\([_A-Za-z0-9]\+\)(\([_A-Za-z0-9]\+\))/domain\.\1[\2]/g
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/CMakeLists.txt b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/CMakeLists.txt
deleted file mode 100644
index 60de31cf1..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/CMakeLists.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-add_definitions(-DUSE_MPI=0 -DUSE_OMP=1)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-if (RAJA_ENABLE_CUDA)
-  cuda_add_executable(lulesh2.0basic.exe
-    lulesh.cc
-    lulesh-comm.cc
-    lulesh-init.cc
-    lulesh-util.cc
-    lulesh-viz.cc)
-  target_link_libraries(lulesh2.0basic.exe RAJA ${RT_LIBARRIES})
-elseif (RAJA_ENABLE_OPENMP)
-  add_executable(lulesh2.0basic.exe
-    lulesh.cc
-    lulesh-comm.cc
-    lulesh-init.cc
-    lulesh-util.cc
-    lulesh-viz.cc)
-  target_link_libraries(lulesh2.0basic.exe RAJA ${RT_LIBRARIES})
-endif()
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/RAJAspecial.hxx b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/RAJAspecial.hxx
deleted file mode 100644
index 90b339dc7..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/RAJAspecial.hxx
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef RAJAspecial_HXX
-#define RAJAspecial_HXX
-
-#include <omp.h>
-
-namespace RAJA {
-
-/*!
- ***************************************************************************** 
- *
- * \brief  Traverse contiguous range of indices using OpenMP for with
- *         nowait clause (assumes loop appears in a parallel region).
- *
- *****************************************************************************
- */
-
-struct omp_for_nowait_exec {};
-
-template <typename LOOP_BODY>
-inline  __attribute__((always_inline))
-void forall(omp_for_nowait_exec,
-            const int begin, const int end,
-            LOOP_BODY loop_body)
-{
-//#pragma omp for nowait schedule(static)
-#pragma omp for nowait
-   for ( int ii = begin ; ii < end ; ++ii ) {
-      loop_body( ii );
-   }
-}
-
-
-}  // closing brace for RAJA namespace
-
-#endif  // closing endif for header file include guard
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/README b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/README
deleted file mode 100644
index 8b0f260ba..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/README
+++ /dev/null
@@ -1,53 +0,0 @@
-This is the README for LULESH 2.0
-
-More information including LULESH 1.0 can be found at https://codesign.llnl.gov/lulesh.php
-
-If you have any questions or problems please contact:
-
-Ian Karlin <karlin1@llnl.gov>
-Jeff Keasler <keasler1@llnl.gov> or
-Rob Neely <neely4@llnl.gov>
-
-Also please send any notable results to Ian Karlin <karlin1@llnl.gov> as we are still evaluating the performance of this code.
-
-*** Notable changes in LULESH 2.0 ***
-
-Split functionality into different files
-lulesh.cc - where most (all?) of the timed functionality lies
-lulesh-comm.cc - MPI functionality
-lulesh-init.cc - Setup code
-lulesh-viz.cc  - Support for visualization option
-lulesh-util.cc - Non-timed functions
-
-The concept of "regions" was added, although every region is the same ideal gas material, and the same sedov blast wave problem is still the only problem its hardcoded to solve. Regions allow two things important to making this proxy app more representative:
-
-Four of the LULESH routines are now performed on a region-by-region basis, making the memory access patterns non-unit stride
-
-Artificial load imbalances can be easily introduced that could impact parallelization strategies.  
-   * The load balance flag changes region assignment.  Region number is raised to the power entered for assignment probability.  Most likely regions changes with MPI process id.
-   * The cost flag raises the cost of ~45% of the regions to evaluate EOS by the entered multiple.  The cost of 5% is 10x the entered
- multiple.
-
-MPI and OpenMP were added, and coalesced into a single version of the source that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
-
-Added support to write plot files using "poor mans parallel I/O" when linked with the silo library, which in turn can be read by VisIt.
-
-Enabled variable timestep calculation by default (courant condition), which results in an additional reduction.  Also, seeded the initial timestep based on analytical equation to allow scaling to arbitrary size.  Therefore steps to solution will differ from LULESH 1.0.
-
-Default domain (mesh) size reduced from 45^3 to 30^3
-
-Command line options to allow for numerous test cases without needing to recompile
-
-Performance optimizations and code cleanup uncovered during study of LULESH 1.0
-
-Added a "Figure of Merit" calculation (elements solved per microsecond) and output in support of using LULESH 2.0 for the 2017 CORAL procurement
-
-Possible Future 2.0 minor updates (other changes possible as discovered)
-
-* Different default parameters
-* Minor code performance changes and cleanupS
-
-TODO in future versions
-* Add reader for (truly) unstructured meshes, probably serial only
-* CMake based build system
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-comm.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-comm.cc
deleted file mode 100644
index a30c3ec1c..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-comm.cc
+++ /dev/null
@@ -1,1837 +0,0 @@
-#include "lulesh.h"
-
-// If no MPI, then this whole file is stubbed out
-#if USE_MPI
-
-#include <mpi.h>
-#include <string.h>
-
-/* Comm Routines */
-
-#define ALLOW_UNPACKED_PLANE false
-#define ALLOW_UNPACKED_ROW   false
-#define ALLOW_UNPACKED_COL   false
-
-/*
-   There are coherence issues for packing and unpacking message
-   buffers.  Ideally, you would like a lot of threads to 
-   cooperate in the assembly/dissassembly of each message.
-   To do that, each thread should really be operating in a
-   different coherence zone.
-
-   Let's assume we have three fields, f1 through f3, defined on
-   a 61x61x61 cube.  If we want to send the block boundary
-   information for each field to each neighbor processor across
-   each cube face, then we have three cases for the
-   memory layout/coherence of data on each of the six cube
-   boundaries:
-
-      (a) Two of the faces will be in contiguous memory blocks
-      (b) Two of the faces will be comprised of pencils of
-          contiguous memory.
-      (c) Two of the faces will have large strides between
-          every value living on the face.
-
-   How do you pack and unpack this data in buffers to
-   simultaneous achieve the best memory efficiency and
-   the most thread independence?
-
-   Do do you pack field f1 through f3 tighly to reduce message
-   size?  Do you align each field on a cache coherence boundary
-   within the message so that threads can pack and unpack each
-   field independently?  For case (b), do you align each
-   boundary pencil of each field separately?  This increases
-   the message size, but could improve cache coherence so
-   each pencil could be processed independently by a separate
-   thread with no conflicts.
-
-   Also, memory access for case (c) would best be done without
-   going through the cache (the stride is so large it just causes
-   a lot of useless cache evictions).  Is it worth creating
-   a special case version of the packing algorithm that uses
-   non-coherent load/store opcodes?
-*/
-
-/******************************************/
-
-
-/* doRecv flag only works with regular block structure */
-void CommRecv(Domain& domain, int msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz, bool doRecv, bool planeOnly) {
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   /* post recieve buffers for all incoming messages */
-   int myRank ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   for (Index_t i=0; i<26; ++i) {
-      domain.recvRequest[i] = MPI_REQUEST_NULL ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   /* post receives */
-
-   /* receive data from neighboring domain faces */
-   if (planeMin && doRecv) {
-      /* contiguous memory */
-      int fromRank = myRank - domain.tp()*domain.tp() ;
-      int recvCount = dx * dy * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (planeMax) {
-      /* contiguous memory */
-      int fromRank = myRank + domain.tp()*domain.tp() ;
-      int recvCount = dx * dy * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (rowMin && doRecv) {
-      /* semi-contiguous memory */
-      int fromRank = myRank - domain.tp() ;
-      int recvCount = dx * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (rowMax) {
-      /* semi-contiguous memory */
-      int fromRank = myRank + domain.tp() ;
-      int recvCount = dx * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (colMin && doRecv) {
-      /* scattered memory */
-      int fromRank = myRank - 1 ;
-      int recvCount = dy * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (colMax) {
-      /* scattered memory */
-      int fromRank = myRank + 1 ;
-      int recvCount = dy * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-
-   if (!planeOnly) {
-      /* receive data from domains connected only by an edge */
-      if (rowMin && colMin && doRecv) {
-         int fromRank = myRank - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMax) {
-         int fromRank = myRank + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMin) {
-         int fromRank = myRank + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMax && doRecv) {
-         int fromRank = myRank - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      /* receive data from domains connected only by a corner */
-      if (rowMin && colMin && planeMin && doRecv) {
-         /* corner at domain logical coord (0, 0, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMin && planeMax) {
-         /* corner at domain logical coord (0, 0, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMin && doRecv) {
-         /* corner at domain logical coord (1, 0, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMax) {
-         /* corner at domain logical coord (1, 0, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMin && doRecv) {
-         /* corner at domain logical coord (0, 1, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMax) {
-         /* corner at domain logical coord (0, 1, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMin && doRecv) {
-         /* corner at domain logical coord (1, 1, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMax) {
-         /* corner at domain logical coord (1, 1, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-   }
-}
-
-/******************************************/
-
-void CommSend(Domain& domain, int msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly)
-{
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   /* post recieve buffers for all incoming messages */
-   int myRank ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
-   MPI_Status status[26] ;
-   Real_t *destAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   for (Index_t i=0; i<26; ++i) {
-      domain.sendRequest[i] = MPI_REQUEST_NULL ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   /* post sends */
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dx * dy ;
-
-      if (planeMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<sendCount; ++i) {
-               destAddr[i] = (domain.*src)(i) ;
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - domain.tp()*domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (planeMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<sendCount; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz - 1) + i) ;
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + domain.tp()*domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dx * dz ;
-
-      if (rowMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  destAddr[i*dx+j] = (domain.*src)(i*dx*dy + j) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (rowMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  destAddr[i*dx+j] = (domain.*src)(dx*(dy - 1) + i*dx*dy + j) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dy * dz ;
-
-      if (colMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  destAddr[i*dy + j] = (domain.*src)(i*dx*dy + j*dx) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - 1, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (colMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  destAddr[i*dy + j] = (domain.*src)(dx - 1 + i*dx*dy + j*dx) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + 1, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-
-   if (!planeOnly) {
-      if (rowMin && colMin) {
-         int toRank = myRank - domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMax && doSend) {
-         int toRank = myRank + domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy - 1 + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-              destAddr[i] = (domain.*src)(dx*(dy-1) + dx*dy*(dz-1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + dx - 1 + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMin && doSend) {
-         int toRank = myRank + domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx*(dy-1) + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMax) {
-         int toRank = myRank - domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx - 1 + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(dx*(dy - 1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx - 1 + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMin && planeMin) {
-         /* corner at domain logical coord (0, 0, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(0) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMin && planeMax && doSend) {
-         /* corner at domain logical coord (0, 0, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMin) {
-         /* corner at domain logical coord (1, 0, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMax && doSend) {
-         /* corner at domain logical coord (1, 0, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMin) {
-         /* corner at domain logical coord (0, 1, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*(dy - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMax && doSend) {
-         /* corner at domain logical coord (0, 1, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMin) {
-         /* corner at domain logical coord (1, 1, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMax && doSend) {
-         /* corner at domain logical coord (1, 1, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*dz - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-   }
-
-   MPI_Waitall(26, domain.sendRequest, status) ;
-}
-
-/******************************************/
-
-void CommSBN(Domain& domain, int xferFields, Domain_member *fieldData) {
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   /* summation order should be from smallest value to largest */
-   /* or we could try out kahan summation! */
-
-   int myRank ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   Index_t dx = domain.sizeX() + 1 ;
-   Index_t dy = domain.sizeY() + 1 ;
-   Index_t dz = domain.sizeZ() + 1 ;
-   MPI_Status status ;
-   Real_t *srcAddr ;
-   Index_t rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = 1 ;
-   if (domain.rowLoc() == 0) {
-      rowMin = 0 ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = 0 ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = 0 ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = 0 ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = 0 ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = 0 ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ;
-
-      if (planeMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(i) += srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (planeMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(dx*dy*(dz - 1) + i) += srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ;
-
-      if (rowMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(i*dx*dy + j) += srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) += srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ;
-
-      if (colMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(dx - 1 + i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin & colMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & colMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*dy - 1 + i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & colMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*(dy-1) + i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & colMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx - 1 + i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy - 1) + i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx - 1 + i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & colMin & planeMin) {
-      /* corner at domain logical coord (0, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(0) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMin & planeMax) {
-      /* corner at domain logical coord (0, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMax & planeMin) {
-      /* corner at domain logical coord (1, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMax & planeMax) {
-      /* corner at domain logical coord (1, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMin & planeMin) {
-      /* corner at domain logical coord (0, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMin & planeMax) {
-      /* corner at domain logical coord (0, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMax & planeMin) {
-      /* corner at domain logical coord (1, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMax & planeMax) {
-      /* corner at domain logical coord (1, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*dz - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-}
-
-/******************************************/
-
-void CommSyncPosVel(Domain& domain) {
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   int myRank ;
-   bool doRecv = false ;
-   Index_t xferFields = 6 ; /* x, y, z, xd, yd, zd */
-   Domain_member fieldData[6] ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   Index_t dx = domain.sizeX() + 1 ;
-   Index_t dy = domain.sizeY() + 1 ;
-   Index_t dz = domain.sizeZ() + 1 ;
-   MPI_Status status ;
-   Real_t *srcAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   fieldData[0] = &Domain::x ;
-   fieldData[1] = &Domain::y ;
-   fieldData[2] = &Domain::z ;
-   fieldData[3] = &Domain::xd ;
-   fieldData[4] = &Domain::yd ;
-   fieldData[5] = &Domain::zd ;
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ;
-
-      if (planeMin && doRecv) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (planeMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(dx*dy*(dz - 1) + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ;
-
-      if (rowMin && doRecv) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(i*dx*dy + j) = srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) = srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ;
-
-      if (colMin && doRecv) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(dx - 1 + i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin && colMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && colMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*dy - 1 + i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && colMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*(dy-1) + i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && colMax && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx - 1 + i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy - 1) + i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx - 1 + i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-
-   if (rowMin && colMin && planeMin && doRecv) {
-      /* corner at domain logical coord (0, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(0) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMin && planeMax) {
-      /* corner at domain logical coord (0, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMax && planeMin && doRecv) {
-      /* corner at domain logical coord (1, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMax && planeMax) {
-      /* corner at domain logical coord (1, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMin && planeMin && doRecv) {
-      /* corner at domain logical coord (0, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMin && planeMax) {
-      /* corner at domain logical coord (0, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMax && planeMin && doRecv) {
-      /* corner at domain logical coord (1, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMax && planeMax) {
-      /* corner at domain logical coord (1, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*dz - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-}
-
-/******************************************/
-
-void CommMonoQ(Domain& domain)
-{
-   if (domain.numRanks() == 1)
-      return ;
-
-   int myRank ;
-   Index_t xferFields = 3 ; /* delv_xi, delv_eta, delv_zeta */
-   Domain_member fieldData[3] ;
-   Index_t fieldOffset[3] ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t dx = domain.sizeX() ;
-   Index_t dy = domain.sizeY() ;
-   Index_t dz = domain.sizeZ() ;
-   MPI_Status status ;
-   Real_t *srcAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   /* point into ghost data area */
-   // fieldData[0] = &(domain.delv_xi(domain.numElem())) ;
-   // fieldData[1] = &(domain.delv_eta(domain.numElem())) ;
-   // fieldData[2] = &(domain.delv_zeta(domain.numElem())) ;
-   fieldData[0] = &Domain::delv_xi ;
-   fieldData[1] = &Domain::delv_eta ;
-   fieldData[2] = &Domain::delv_zeta ;
-   fieldOffset[0] = domain.numElem() ;
-   fieldOffset[1] = domain.numElem() ;
-   fieldOffset[2] = domain.numElem() ;
-
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ;
-
-      if (planeMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (planeMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ;
-
-      if (rowMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ;
-
-      if (colMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-}
-
-#endif
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-init.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-init.cc
deleted file mode 100644
index cee9dc3e6..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-init.cc
+++ /dev/null
@@ -1,739 +0,0 @@
-#include <math.h>
-#if USE_MPI
-# include <mpi.h>
-#endif
-#if USE_OMP
-#include <omp.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <cstdlib>
-#include "lulesh.h"
-
-/////////////////////////////////////////////////////////////////////
-Domain::Domain(Int_t numRanks, Index_t colLoc,
-               Index_t rowLoc, Index_t planeLoc,
-               Index_t nx, int tp, int nr, int balance, Int_t cost)
-   :
-   m_e_cut(Real_t(1.0e-7)),
-   m_p_cut(Real_t(1.0e-7)),
-   m_q_cut(Real_t(1.0e-7)),
-   m_v_cut(Real_t(1.0e-10)),
-   m_u_cut(Real_t(1.0e-7)),
-   m_hgcoef(Real_t(3.0)),
-   m_ss4o3(Real_t(4.0)/Real_t(3.0)),
-   m_qstop(Real_t(1.0e+12)),
-   m_monoq_max_slope(Real_t(1.0)),
-   m_monoq_limiter_mult(Real_t(2.0)),
-   m_qlc_monoq(Real_t(0.5)),
-   m_qqc_monoq(Real_t(2.0)/Real_t(3.0)),
-   m_qqc(Real_t(2.0)),
-   m_eosvmax(Real_t(1.0e+9)),
-   m_eosvmin(Real_t(1.0e-9)),
-   m_pmin(Real_t(0.)),
-   m_emin(Real_t(-1.0e+15)),
-   m_dvovmax(Real_t(0.1)),
-   m_refdens(Real_t(1.0)),
-//
-// set pointers to (potentially) "new'd" arrays to null to 
-// simplify deallocation.
-//
-   m_regNumList(0),
-   m_nodeElemStart(0),
-   m_nodeElemCornerList(0),
-   m_regElemSize(0),
-   m_regElemlist(0)
-#if USE_MPI
-   , 
-   commDataSend(0),
-   commDataRecv(0)
-#endif
-{
-
-   Index_t edgeElems = nx ;
-   Index_t edgeNodes = edgeElems+1 ;
-   this->cost() = cost;
-
-   m_tp       = tp ;
-   m_numRanks = numRanks ;
-
-   ///////////////////////////////
-   //   Initialize Sedov Mesh
-   ///////////////////////////////
-
-   // construct a uniform box for this processor
-
-   m_colLoc   =   colLoc ;
-   m_rowLoc   =   rowLoc ;
-   m_planeLoc = planeLoc ;
-   
-   m_sizeX = edgeElems ;
-   m_sizeY = edgeElems ;
-   m_sizeZ = edgeElems ;
-   m_numElem = edgeElems*edgeElems*edgeElems ;
-
-   m_numNode = edgeNodes*edgeNodes*edgeNodes ;
-
-   m_regNumList = new Index_t[numElem()] ;  // material indexset
-
-   // Elem-centered 
-   AllocateElemPersistent(numElem()) ;
-
-   // Node-centered 
-   AllocateNodePersistent(numNode()) ;
-
-   SetupCommBuffers(edgeNodes);
-
-   // Basic Field Initialization 
-   for (Index_t i=0; i<numElem(); ++i) {
-      e(i) =  Real_t(0.0) ;
-      p(i) =  Real_t(0.0) ;
-      q(i) =  Real_t(0.0) ;
-      ss(i) = Real_t(0.0) ;
-   }
-
-   // Note - v initializes to 1.0, not 0.0!
-   for (Index_t i=0; i<numElem(); ++i) {
-      v(i) = Real_t(1.0) ;
-   }
-
-   for (Index_t i=0; i<numNode(); ++i) {
-      xd(i) = Real_t(0.0) ;
-      yd(i) = Real_t(0.0) ;
-      zd(i) = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<numNode(); ++i) {
-      xdd(i) = Real_t(0.0) ;
-      ydd(i) = Real_t(0.0) ;
-      zdd(i) = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<numNode(); ++i) {
-      nodalMass(i) = Real_t(0.0) ;
-   }
-
-   BuildMesh(nx, edgeNodes, edgeElems);
-
-#if USE_OMP
-   SetupThreadSupportStructures();
-#endif
-
-   // Setup region index sets. For now, these are constant sized
-   // throughout the run, but could be changed every cycle to 
-   // simulate effects of ALE on the lagrange solver
-   CreateRegionIndexSets(nr, balance);
-
-   // Setup symmetry nodesets
-   SetupSymmetryPlanes(edgeNodes);
-
-   // Setup element connectivities
-   SetupElementConnectivities(edgeElems);
-
-   // Setup symmetry planes and free surface boundary arrays
-   SetupBoundaryConditions(edgeElems);
-
-
-   // Setup defaults
-
-   // These can be changed (requires recompile) if you want to run
-   // with a fixed timestep, or to a different end time, but it's
-   // probably easier/better to just run a fixed number of timesteps
-   // using the -i flag in 2.x
-
-   dtfixed() = Real_t(-1.0e-6) ; // Negative means use courant condition
-   stoptime()  = Real_t(1.0e-2); // *Real_t(edgeElems*tp/45.0) ;
-
-   // Initial conditions
-   deltatimemultlb() = Real_t(1.1) ;
-   deltatimemultub() = Real_t(1.2) ;
-   dtcourant() = Real_t(1.0e+20) ;
-   dthydro()   = Real_t(1.0e+20) ;
-   dtmax()     = Real_t(1.0e-2) ;
-   time()    = Real_t(0.) ;
-   cycle()   = Int_t(0) ;
-
-   // initialize field data 
-   for (Index_t i=0; i<numElem(); ++i) {
-      Real_t x_local[8], y_local[8], z_local[8] ;
-      Index_t *elemToNode = nodelist(i) ;
-      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-      {
-        Index_t gnode = elemToNode[lnode];
-        x_local[lnode] = x(gnode);
-        y_local[lnode] = y(gnode);
-        z_local[lnode] = z(gnode);
-      }
-
-      // volume calculations
-      Real_t volume = CalcElemVolume(x_local, y_local, z_local );
-      volo(i) = volume ;
-      elemMass(i) = volume ;
-      for (Index_t j=0; j<8; ++j) {
-         Index_t idx = elemToNode[j] ;
-         nodalMass(idx) += volume / Real_t(8.0) ;
-      }
-   }
-
-   // deposit initial energy
-   // An energy of 3.948746e+7 is correct for a problem with
-   // 45 zones along a side - we need to scale it
-   const Real_t ebase = Real_t(3.948746e+7);
-   Real_t scale = (nx*m_tp)/Real_t(45.0);
-   Real_t einit = ebase*scale*scale*scale;
-   if (m_rowLoc + m_colLoc + m_planeLoc == 0) {
-      // Dump into the first zone (which we know is in the corner)
-      // of the domain that sits at the origin
-      e(0) = einit;
-   }
-   //set initial deltatime base on analytic CFL calculation
-   deltatime() = (Real_t(.5)*cbrt(volo(0)))/sqrt(Real_t(2.0)*einit);
-
-} // End constructor
-
-
-////////////////////////////////////////////////////////////////////////////////
-Domain::~Domain()
-{
-   delete [] m_regNumList;
-   delete [] m_nodeElemStart;
-   delete [] m_nodeElemCornerList;
-   delete [] m_regElemSize;
-   for (Index_t i=0 ; i<numReg() ; ++i) {
-     delete [] m_regElemlist[i];
-   }
-   delete [] m_regElemlist;
-   
-#if USE_MPI
-   delete [] commDataSend;
-   delete [] commDataRecv;
-#endif
-} // End destructor
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems)
-{
-  Index_t meshEdgeElems = m_tp*nx ;
-
-  // initialize nodal coordinates 
-  Index_t nidx = 0 ;
-  Real_t tz = Real_t(1.125)*Real_t(m_planeLoc*nx)/Real_t(meshEdgeElems) ;
-  for (Index_t plane=0; plane<edgeNodes; ++plane) {
-    Real_t ty = Real_t(1.125)*Real_t(m_rowLoc*nx)/Real_t(meshEdgeElems) ;
-    for (Index_t row=0; row<edgeNodes; ++row) {
-      Real_t tx = Real_t(1.125)*Real_t(m_colLoc*nx)/Real_t(meshEdgeElems) ;
-      for (Index_t col=0; col<edgeNodes; ++col) {
-	x(nidx) = tx ;
-	y(nidx) = ty ;
-	z(nidx) = tz ;
-	++nidx ;
-	// tx += ds ; // may accumulate roundoff... 
-	tx = Real_t(1.125)*Real_t(m_colLoc*nx+col+1)/Real_t(meshEdgeElems) ;
-      }
-      // ty += ds ;  // may accumulate roundoff... 
-      ty = Real_t(1.125)*Real_t(m_rowLoc*nx+row+1)/Real_t(meshEdgeElems) ;
-    }
-    // tz += ds ;  // may accumulate roundoff... 
-    tz = Real_t(1.125)*Real_t(m_planeLoc*nx+plane+1)/Real_t(meshEdgeElems) ;
-  }
-
-
-  // embed hexehedral elements in nodal point lattice 
-  Index_t zidx = 0 ;
-  nidx = 0 ;
-  for (Index_t plane=0; plane<edgeElems; ++plane) {
-    for (Index_t row=0; row<edgeElems; ++row) {
-      for (Index_t col=0; col<edgeElems; ++col) {
-	Index_t *localNode = nodelist(zidx) ;
-	localNode[0] = nidx                                       ;
-	localNode[1] = nidx                                   + 1 ;
-	localNode[2] = nidx                       + edgeNodes + 1 ;
-	localNode[3] = nidx                       + edgeNodes     ;
-	localNode[4] = nidx + edgeNodes*edgeNodes                 ;
-	localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
-	localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
-	localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
-	++zidx ;
-	++nidx ;
-      }
-      ++nidx ;
-    }
-    nidx += edgeNodes ;
-  }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::SetupThreadSupportStructures()
-{
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-
-  if (numthreads > 1) {
-    // set up node-centered indexing of elements 
-    Index_t *nodeElemCount = new Index_t[numNode()] ;
-
-    for (Index_t i=0; i<numNode(); ++i) {
-      nodeElemCount[i] = 0 ;
-    }
-
-    for (Index_t i=0; i<numElem(); ++i) {
-      Index_t *nl = nodelist(i) ;
-      for (Index_t j=0; j < 8; ++j) {
-	++(nodeElemCount[nl[j]] );
-      }
-    }
-
-    m_nodeElemStart = new Index_t[numNode()+1] ;
-
-    m_nodeElemStart[0] = 0;
-
-    for (Index_t i=1; i <= numNode(); ++i) {
-      m_nodeElemStart[i] =
-	m_nodeElemStart[i-1] + nodeElemCount[i-1] ;
-    }
-       
-    m_nodeElemCornerList = new Index_t[m_nodeElemStart[numNode()]];
-
-    for (Index_t i=0; i < numNode(); ++i) {
-      nodeElemCount[i] = 0;
-    }
-
-    for (Index_t i=0; i < numElem(); ++i) {
-      Index_t *nl = nodelist(i) ;
-      for (Index_t j=0; j < 8; ++j) {
-	Index_t m = nl[j];
-	Index_t k = i*8 + j ;
-	Index_t offset = m_nodeElemStart[m] + nodeElemCount[m] ;
-	m_nodeElemCornerList[offset] = k;
-	++(nodeElemCount[m]) ;
-      }
-    }
-
-    Index_t clSize = m_nodeElemStart[numNode()] ;
-    for (Index_t i=0; i < clSize; ++i) {
-      Index_t clv = m_nodeElemCornerList[i] ;
-      if ((clv < 0) || (clv > numElem()*8)) {
-	fprintf(stderr,
-		"AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n");
-#if USE_MPI
-	MPI_Abort(MPI_COMM_WORLD, -1);
-#else
-	exit(-1);
-#endif
-      }
-    }
-
-    delete [] nodeElemCount ;
-  }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::SetupCommBuffers(Int_t edgeNodes)
-{
-  // allocate a buffer large enough for nodal ghost data 
-  Index_t maxEdgeSize = MAX(this->sizeX(), MAX(this->sizeY(), this->sizeZ()))+1 ;
-  m_maxPlaneSize = CACHE_ALIGN_REAL(maxEdgeSize*maxEdgeSize) ;
-  m_maxEdgeSize = CACHE_ALIGN_REAL(maxEdgeSize) ;
-
-  // assume communication to 6 neighbors by default 
-  m_rowMin = (m_rowLoc == 0)        ? 0 : 1;
-  m_rowMax = (m_rowLoc == m_tp-1)     ? 0 : 1;
-  m_colMin = (m_colLoc == 0)        ? 0 : 1;
-  m_colMax = (m_colLoc == m_tp-1)     ? 0 : 1;
-  m_planeMin = (m_planeLoc == 0)    ? 0 : 1;
-  m_planeMax = (m_planeLoc == m_tp-1) ? 0 : 1;
-
-#if USE_MPI   
-  // account for face communication 
-  Index_t comBufSize =
-    (m_rowMin + m_rowMax + m_colMin + m_colMax + m_planeMin + m_planeMax) *
-    m_maxPlaneSize * MAX_FIELDS_PER_MPI_COMM ;
-
-  // account for edge communication 
-  comBufSize +=
-    ((m_rowMin & m_colMin) + (m_rowMin & m_planeMin) + (m_colMin & m_planeMin) +
-     (m_rowMax & m_colMax) + (m_rowMax & m_planeMax) + (m_colMax & m_planeMax) +
-     (m_rowMax & m_colMin) + (m_rowMin & m_planeMax) + (m_colMin & m_planeMax) +
-     (m_rowMin & m_colMax) + (m_rowMax & m_planeMin) + (m_colMax & m_planeMin)) *
-    m_maxEdgeSize * MAX_FIELDS_PER_MPI_COMM ;
-
-  // account for corner communication 
-  // factor of 16 is so each buffer has its own cache line 
-  comBufSize += ((m_rowMin & m_colMin & m_planeMin) +
-		 (m_rowMin & m_colMin & m_planeMax) +
-		 (m_rowMin & m_colMax & m_planeMin) +
-		 (m_rowMin & m_colMax & m_planeMax) +
-		 (m_rowMax & m_colMin & m_planeMin) +
-		 (m_rowMax & m_colMin & m_planeMax) +
-		 (m_rowMax & m_colMax & m_planeMin) +
-		 (m_rowMax & m_colMax & m_planeMax)) * CACHE_COHERENCE_PAD_REAL ;
-
-  this->commDataSend = new Real_t[comBufSize] ;
-  this->commDataRecv = new Real_t[comBufSize] ;
-  // prevent floating point exceptions 
-  memset(this->commDataSend, 0, comBufSize*sizeof(Real_t)) ;
-  memset(this->commDataRecv, 0, comBufSize*sizeof(Real_t)) ;
-#endif   
-
-  // Boundary nodesets
-  if (m_colLoc == 0)
-    m_symmX.resize(edgeNodes*edgeNodes);
-  if (m_rowLoc == 0)
-    m_symmY.resize(edgeNodes*edgeNodes);
-  if (m_planeLoc == 0)
-    m_symmZ.resize(edgeNodes*edgeNodes);
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::CreateRegionIndexSets(Int_t nr, Int_t balance)
-{
-#if USE_MPI   
-   Index_t myRank;
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-   srand(myRank);
-#else
-   srand(0);
-   Index_t myRank = 0;
-#endif
-   this->numReg() = nr;
-   m_regElemSize = new Index_t[numReg()];
-   m_regElemlist = new Index_t*[numReg()];
-   Index_t nextIndex = 0;
-   //if we only have one region just fill it
-   // Fill out the regNumList with material numbers, which are always
-   // the region index plus one 
-   if(numReg() == 1) {
-      while (nextIndex < numElem()) {
-	 this->regNumList(nextIndex) = 1;
-         nextIndex++;
-      }
-      regElemSize(0) = 0;
-   }
-   //If we have more than one region distribute the elements.
-   else {
-      Int_t regionNum;
-      Int_t regionVar;
-      Int_t lastReg = -1;
-      Int_t binSize;
-      Index_t elements;
-      Index_t runto = 0;
-      Int_t costDenominator = 0;
-      Int_t* regBinEnd = new Int_t[numReg()];
-      //Determine the relative weights of all the regions.  This is based off the -b flag.  Balance is the value passed into b.  
-      for (Index_t i=0 ; i<numReg() ; ++i) {
-         regElemSize(i) = 0;
-	 costDenominator += pow((i+1), balance);  //Total sum of all regions weights
-	 regBinEnd[i] = costDenominator;  //Chance of hitting a given region is (regBinEnd[i] - regBinEdn[i-1])/costDenominator
-      }
-      //Until all elements are assigned
-      while (nextIndex < numElem()) {
-	 //pick the region
-	 regionVar = rand() % costDenominator;
-	 Index_t i = 0;
-         while(regionVar >= regBinEnd[i])
-	    i++;
-         //rotate the regions based on MPI rank.  Rotation is Rank % NumRegions this makes each domain have a different region with 
-         //the highest representation
-	 regionNum = ((i + myRank) % numReg()) + 1;
-	 // make sure we don't pick the same region twice in a row
-         while(regionNum == lastReg) {
-	    regionVar = rand() % costDenominator;
-	    i = 0;
-            while(regionVar >= regBinEnd[i])
-	       i++;
-	    regionNum = ((i + myRank) % numReg()) + 1;
-         }
-	 //Pick the bin size of the region and determine the number of elements.
-         binSize = rand() % 1000;
-	 if(binSize < 773) {
-	   elements = rand() % 15 + 1;
-	 }
-	 else if(binSize < 937) {
-	   elements = rand() % 16 + 16;
-	 }
-	 else if(binSize < 970) {
-	   elements = rand() % 32 + 32;
-	 }
-	 else if(binSize < 974) {
-	   elements = rand() % 64 + 64;
-	 } 
-	 else if(binSize < 978) {
-	   elements = rand() % 128 + 128;
-	 }
-	 else if(binSize < 981) {
-	   elements = rand() % 256 + 256;
-	 }
-	 else
-	    elements = rand() % 1537 + 512;
-	 runto = elements + nextIndex;
-	 //Store the elements.  If we hit the end before we run out of elements then just stop.
-         while (nextIndex < runto && nextIndex < numElem()) {
-	    this->regNumList(nextIndex) = regionNum;
-	    nextIndex++;
-	 }
-	 lastReg = regionNum;
-      }
-
-      delete [] regBinEnd; 
-   }
-   // Convert regNumList to region index sets
-   // First, count size of each region 
-   for (Index_t i=0 ; i<numElem() ; ++i) {
-      int r = this->regNumList(i)-1; // region index == regnum-1
-      regElemSize(r)++;
-   }
-   // Second, allocate each region index set
-   for (Index_t i=0 ; i<numReg() ; ++i) {
-      m_regElemlist[i] = new Index_t[regElemSize(i)];
-      regElemSize(i) = 0;
-   }
-   // Third, fill index sets
-   for (Index_t i=0 ; i<numElem() ; ++i) {
-      Index_t r = regNumList(i)-1;       // region index == regnum-1
-      Index_t regndx = regElemSize(r)++; // Note increment
-      regElemlist(r,regndx) = i;
-   }
-   
-}
-
-/////////////////////////////////////////////////////////////
-void 
-Domain::SetupSymmetryPlanes(Int_t edgeNodes)
-{
-  Index_t nidx = 0 ;
-  for (Index_t i=0; i<edgeNodes; ++i) {
-    Index_t planeInc = i*edgeNodes*edgeNodes ;
-    Index_t rowInc   = i*edgeNodes ;
-    for (Index_t j=0; j<edgeNodes; ++j) {
-      if (m_planeLoc == 0) {
-	m_symmZ[nidx] = rowInc   + j ;
-      }
-      if (m_rowLoc == 0) {
-	m_symmY[nidx] = planeInc + j ;
-      }
-      if (m_colLoc == 0) {
-	m_symmX[nidx] = planeInc + j*edgeNodes ;
-      }
-      ++nidx ;
-    }
-  }
-}
-
-
-
-/////////////////////////////////////////////////////////////
-void
-Domain::SetupElementConnectivities(Int_t edgeElems)
-{
-   lxim(0) = 0 ;
-   for (Index_t i=1; i<numElem(); ++i) {
-      lxim(i)   = i-1 ;
-      lxip(i-1) = i ;
-   }
-   lxip(numElem()-1) = numElem()-1 ;
-
-   for (Index_t i=0; i<edgeElems; ++i) {
-      letam(i) = i ; 
-      letap(numElem()-edgeElems+i) = numElem()-edgeElems+i ;
-   }
-   for (Index_t i=edgeElems; i<numElem(); ++i) {
-      letam(i) = i-edgeElems ;
-      letap(i-edgeElems) = i ;
-   }
-
-   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
-      lzetam(i) = i ;
-      lzetap(numElem()-edgeElems*edgeElems+i) = numElem()-edgeElems*edgeElems+i ;
-   }
-   for (Index_t i=edgeElems*edgeElems; i<numElem(); ++i) {
-      lzetam(i) = i - edgeElems*edgeElems ;
-      lzetap(i-edgeElems*edgeElems) = i ;
-   }
-}
-
-/////////////////////////////////////////////////////////////
-void
-Domain::SetupBoundaryConditions(Int_t edgeElems) 
-{
-  Index_t ghostIdx[6] ;  // offsets to ghost locations
-
-  // set up boundary condition information
-  for (Index_t i=0; i<numElem(); ++i) {
-     elemBC(i) = Int_t(0) ;
-  }
-
-  for (Index_t i=0; i<6; ++i) {
-    ghostIdx[i] = INT_MIN ;
-  }
-
-  Int_t pidx = numElem() ;
-  if (m_planeMin != 0) {
-    ghostIdx[0] = pidx ;
-    pidx += sizeX()*sizeY() ;
-  }
-
-  if (m_planeMax != 0) {
-    ghostIdx[1] = pidx ;
-    pidx += sizeX()*sizeY() ;
-  }
-
-  if (m_rowMin != 0) {
-    ghostIdx[2] = pidx ;
-    pidx += sizeX()*sizeZ() ;
-  }
-
-  if (m_rowMax != 0) {
-    ghostIdx[3] = pidx ;
-    pidx += sizeX()*sizeZ() ;
-  }
-
-  if (m_colMin != 0) {
-    ghostIdx[4] = pidx ;
-    pidx += sizeY()*sizeZ() ;
-  }
-
-  if (m_colMax != 0) {
-    ghostIdx[5] = pidx ;
-  }
-
-  // symmetry plane or free surface BCs 
-  for (Index_t i=0; i<edgeElems; ++i) {
-    Index_t planeInc = i*edgeElems*edgeElems ;
-    Index_t rowInc   = i*edgeElems ;
-    for (Index_t j=0; j<edgeElems; ++j) {
-      if (m_planeLoc == 0) {
-	elemBC(rowInc+j) |= ZETA_M_SYMM ;
-      }
-      else {
-	elemBC(rowInc+j) |= ZETA_M_COMM ;
-	lzetam(rowInc+j) = ghostIdx[0] + rowInc + j ;
-      }
-
-      if (m_planeLoc == m_tp-1) {
-	elemBC(rowInc+j+numElem()-edgeElems*edgeElems) |=
-	  ZETA_P_FREE;
-      }
-      else {
-	elemBC(rowInc+j+numElem()-edgeElems*edgeElems) |=
-	  ZETA_P_COMM ;
-	lzetap(rowInc+j+numElem()-edgeElems*edgeElems) =
-	  ghostIdx[1] + rowInc + j ;
-      }
-
-      if (m_rowLoc == 0) {
-	elemBC(planeInc+j) |= ETA_M_SYMM ;
-      }
-      else {
-	elemBC(planeInc+j) |= ETA_M_COMM ;
-	letam(planeInc+j) = ghostIdx[2] + rowInc + j ;
-      }
-
-      if (m_rowLoc == m_tp-1) {
-	elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= 
-	  ETA_P_FREE ;
-      }
-      else {
-	elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= 
-	  ETA_P_COMM ;
-	letap(planeInc+j+edgeElems*edgeElems-edgeElems) =
-	  ghostIdx[3] +  rowInc + j ;
-      }
-
-      if (m_colLoc == 0) {
-	elemBC(planeInc+j*edgeElems) |= XI_M_SYMM ;
-      }
-      else {
-	elemBC(planeInc+j*edgeElems) |= XI_M_COMM ;
-	lxim(planeInc+j*edgeElems) = ghostIdx[4] + rowInc + j ;
-      }
-
-      if (m_colLoc == m_tp-1) {
-	elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_FREE ;
-      }
-      else {
-	elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_COMM ;
-	lxip(planeInc+j*edgeElems+edgeElems-1) =
-	  ghostIdx[5] + rowInc + j ;
-      }
-    }
-  }
-}
-
-///////////////////////////////////////////////////////////////////////////
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side)
-{
-   Int_t testProcs;
-   Int_t dx, dy, dz;
-   Int_t myDom;
-   
-   // Assume cube processor layout for now 
-   testProcs = Int_t(cbrt(Real_t(numRanks))+0.5) ;
-   if (testProcs*testProcs*testProcs != numRanks) {
-      printf("Num processors must be a cube of an integer (1, 8, 27, ...)\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   if (sizeof(Real_t) != 4 && sizeof(Real_t) != 8) {
-      printf("MPI operations only support float and double right now...\n");
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   if (MAX_FIELDS_PER_MPI_COMM > CACHE_COHERENCE_PAD_REAL) {
-      printf("corner element comm buffers too small.  Fix code.\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-
-   dx = testProcs ;
-   dy = testProcs ;
-   dz = testProcs ;
-
-   // temporary test
-   if (dx*dy*dz != numRanks) {
-      printf("error -- must have as many domains as procs\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   Int_t remainder = dx*dy*dz % numRanks ;
-   if (myRank < remainder) {
-      myDom = myRank*( 1+ (dx*dy*dz / numRanks)) ;
-   }
-   else {
-      myDom = remainder*( 1+ (dx*dy*dz / numRanks)) +
-         (myRank - remainder)*(dx*dy*dz/numRanks) ;
-   }
-
-   *col = myDom % dx ;
-   *row = (myDom / dx) % dy ;
-   *plane = myDom / (dx*dy) ;
-   *side = testProcs;
-
-   return;
-}
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-util.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-util.cc
deleted file mode 100644
index bdade86d9..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-util.cc
+++ /dev/null
@@ -1,224 +0,0 @@
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <stdio.h>
-#if USE_MPI
-#include <mpi.h>
-#endif
-#include "lulesh.h"
-
-/* Helper function for converting strings to ints, with error checking */
-int StrToInt(const char *token, int *retVal)
-{
-   const char *c ;
-   char *endptr ;
-   const int decimal_base = 10 ;
-
-   if (token == NULL)
-      return 0 ;
-   
-   c = token ;
-   *retVal = (int)strtol(c, &endptr, decimal_base) ;
-   if((endptr != c) && ((*endptr == ' ') || (*endptr == '\0')))
-      return 1 ;
-   else
-      return 0 ;
-}
-
-static void PrintCommandLineOptions(char *execname, int myRank)
-{
-   if (myRank == 0) {
-
-      printf("Usage: %s [opts]\n", execname);
-      printf(" where [opts] is one or more of:\n");
-      printf(" -q              : quiet mode - suppress all stdout\n");
-      printf(" -i <iterations> : number of cycles to run\n");
-      printf(" -s <size>       : length of cube mesh along side\n");
-      printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
-      printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
-      printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
-      printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
-      printf(" -p              : Print out progress\n");
-      printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
-      printf(" -h              : This message\n");
-      printf("\n\n");
-   }
-}
-
-static void ParseError(const char *message, int myRank)
-{
-   if (myRank == 0) {
-      printf("%s\n", message);
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1);
-#else
-      exit(-1);
-#endif
-   }
-}
-
-void ParseCommandLineOptions(int argc, char *argv[],
-                             int myRank, struct cmdLineOpts *opts)
-{
-   if(argc > 1) {
-      int i = 1;
-
-      while(i < argc) {
-         int ok;
-         /* -i <iterations> */
-         if(strcmp(argv[i], "-i") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -i", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->its));
-            if(!ok) {
-               ParseError("Parse Error on option -i integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -s <size, sidelength> */
-         else if(strcmp(argv[i], "-s") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -s\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->nx));
-            if(!ok) {
-               ParseError("Parse Error on option -s integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-	 /* -r <numregions> */
-         else if (strcmp(argv[i], "-r") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -r\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->numReg));
-            if (!ok) {
-               ParseError("Parse Error on option -r integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-	 /* -f <numfilepieces> */
-         else if (strcmp(argv[i], "-f") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -f\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->numFiles));
-            if (!ok) {
-               ParseError("Parse Error on option -f integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -p */
-         else if (strcmp(argv[i], "-p") == 0) {
-            opts->showProg = 1;
-            i++;
-         }
-         /* -q */
-         else if (strcmp(argv[i], "-q") == 0) {
-            opts->quiet = 1;
-            i++;
-         }
-         else if (strcmp(argv[i], "-b") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -b\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->balance));
-            if (!ok) {
-               ParseError("Parse Error on option -b integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         else if (strcmp(argv[i], "-c") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -c\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->cost));
-            if (!ok) {
-               ParseError("Parse Error on option -c integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -v */
-         else if (strcmp(argv[i], "-v") == 0) {
-#if VIZ_MESH            
-            opts->viz = 1;
-#else
-            ParseError("Use of -v requires compiling with -DVIZ_MESH\n", myRank);
-#endif
-            i++;
-         }
-         /* -h */
-         else if (strcmp(argv[i], "-h") == 0) {
-            PrintCommandLineOptions(argv[0], myRank);
-#if USE_MPI            
-            MPI_Abort(MPI_COMM_WORLD, 0);
-#else
-            exit(0);
-#endif
-         }
-         else {
-            char msg[80];
-            PrintCommandLineOptions(argv[0], myRank);
-            sprintf(msg, "ERROR: Unknown command line argument: %s\n", argv[i]);
-            ParseError(msg, myRank);
-         }
-      }
-   }
-}
-
-/////////////////////////////////////////////////////////////////////
-
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks)
-{
-   // GrindTime1 only takes a single domain into account, and is thus a good way to measure
-   // processor speed indepdendent of MPI parallelism.
-   // GrindTime2 takes into account speedups from MPI parallelism 
-   Real_t grindTime1 = ((elapsed_time*1e6)/locDom.cycle())/(nx*nx*nx);
-   Real_t grindTime2 = ((elapsed_time*1e6)/locDom.cycle())/(nx*nx*nx*numRanks);
-
-   Index_t ElemId = 0;
-   printf("Run completed:  \n");
-   printf("   Problem size        =  %i \n",    nx);
-   printf("   MPI tasks           =  %i \n",    numRanks);
-   printf("   Iteration count     =  %i \n",    locDom.cycle());
-   printf("   Final Origin Energy = %12.6e \n", locDom.e(ElemId));
-
-   Real_t   MaxAbsDiff = Real_t(0.0);
-   Real_t TotalAbsDiff = Real_t(0.0);
-   Real_t   MaxRelDiff = Real_t(0.0);
-
-   for (Index_t j=0; j<nx; ++j) {
-      for (Index_t k=j+1; k<nx; ++k) {
-         Real_t AbsDiff = FABS(locDom.e(j*nx+k)-locDom.e(k*nx+j));
-         TotalAbsDiff  += AbsDiff;
-
-         if (MaxAbsDiff <AbsDiff) {
-            MaxAbsDiff = AbsDiff;
-         }
-
-         if (locDom.e(k*nx+j) != 0.0) {
-            Real_t RelDiff = AbsDiff / locDom.e(k*nx+j);
-            if (MaxRelDiff <RelDiff) {
-               MaxRelDiff = RelDiff;
-            }
-         }
-      }
-   }
-
-   // Quick symmetry check
-   printf("   Testing Plane 0 of Energy Array on rank 0:\n");
-   printf("        MaxAbsDiff   = %12.6e\n",   MaxAbsDiff   );
-   printf("        TotalAbsDiff = %12.6e\n",   TotalAbsDiff );
-   printf("        MaxRelDiff   = %12.6e\n\n", MaxRelDiff   );
-
-   // Timing information
-   printf("\nElapsed time         = %10.2f (s)\n", elapsed_time);
-   printf("Grind time (us/z/c)  = %10.8g (per dom)  (%10.8g overall)\n", grindTime1, grindTime2);
-   printf("FOM                  = %10.8g (z/s)\n\n", 1000.0/grindTime2); // zones per second
-
-   return ;
-}
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-viz.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-viz.cc
deleted file mode 100644
index f0d1f36e4..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh-viz.cc
+++ /dev/null
@@ -1,432 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include "lulesh.h"
-
-#if defined(VIZ_MESH)
-
-#ifdef __cplusplus
-  extern "C" {
-#endif
-#include "silo.h"
-#if USE_MPI
-# include "pmpio.h"
-#endif
-#ifdef __cplusplus
-  }
-#endif
-
-// Function prototypes
-static void 
-DumpDomainToVisit(DBfile *db, Domain& domain, int myRank);
-static
-
-
-#if USE_MPI
-// For some reason, earlier versions of g++ (e.g. 4.2) won't let me
-// put the 'static' qualifier on this prototype, even if it's done
-// consistently in the prototype and definition
-void
-DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 
-                      char basename[], int numRanks);
-
-// Callback prototypes for PMPIO interface (only useful if we're
-// running parallel)
-static void *
-LULESH_PMPIO_Create(const char *fname,
-		     const char *dname,
-		     void *udata);
-static void *
-LULESH_PMPIO_Open(const char *fname,
-		   const char *dname,
-		   PMPIO_iomode_t ioMode,
-		   void *udata);
-static void
-LULESH_PMPIO_Close(void *file, void *udata);
-
-#else
-void
-DumpMultiblockObjects(DBfile *db, char basename[], int numRanks);
-#endif
-
-
-/**********************************************************************/
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks) 
-{
-  char subdirName[32];
-  char basename[32];
-  DBfile *db;
-
-
-  sprintf(basename, "lulesh_plot_c%d", domain.cycle());
-  sprintf(subdirName, "data_%d", myRank);
-
-#if USE_MPI
-
-  PMPIO_baton_t *bat = PMPIO_Init(numFiles,
-				  PMPIO_WRITE,
-				  MPI_COMM_WORLD,
-				  10101,
-				  LULESH_PMPIO_Create,
-				  LULESH_PMPIO_Open,
-				  LULESH_PMPIO_Close,
-				  NULL);
-
-  int myiorank = PMPIO_GroupRank(bat, myRank);
-
-  char fileName[64];
-  
-  if (myiorank == 0) 
-    strcpy(fileName, basename);
-  else
-    sprintf(fileName, "%s.%03d", basename, myiorank);
-
-  db = (DBfile*)PMPIO_WaitForBaton(bat, fileName, subdirName);
-
-  DumpDomainToVisit(db, domain, myRank);
-
-  // Processor 0 writes out bit of extra data to its file that
-  // describes how to stitch all the pieces together
-  if (myRank == 0) {
-    DumpMultiblockObjects(db, bat, basename, numRanks);
-  }
-
-  PMPIO_HandOffBaton(bat, db);
-
-  PMPIO_Finish(bat);
-#else
-
-  db = (DBfile*)DBCreate(basename, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X);
-
-  if (db) {
-     DBMkDir(db, subdirName);
-     DBSetDir(db, subdirName);
-     DumpDomainToVisit(db, domain, myRank);
-     DumpMultiblockObjects(db, basename, numRanks);
-  }
-  else {
-     printf("Error writing out viz file - rank %d\n", myRank);
-  }
-
-#endif
-}
-
-
-
-/**********************************************************************/
-
-static void 
-DumpDomainToVisit(DBfile *db, Domain& domain, int myRank)
-{
-   int ok = 0;
-   
-   /* Create an option list that will give some hints to VisIt for
-    * printing out the cycle and time in the annotations */
-   DBoptlist *optlist;
-
-
-   /* Write out the mesh connectivity in fully unstructured format */
-   int shapetype[1] = {DB_ZONETYPE_HEX};
-   int shapesize[1] = {8};
-   int shapecnt[1] = {domain.numElem()};
-   int *conn = new int[domain.numElem()*8] ;
-   int ci = 0 ;
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      Index_t *elemToNode = domain.nodelist(ei) ;
-      for (int ni=0; ni < 8; ++ni) {
-         conn[ci++] = elemToNode[ni] ;
-      }
-   }
-   ok += DBPutZonelist2(db, "connectivity", domain.numElem(), 3,
-                        conn, domain.numElem()*8,
-                        0,0,0, /* Not carrying ghost zones */
-                        shapetype, shapesize, shapecnt,
-                        1, NULL);
-   delete [] conn ;
-
-   /* Write out the mesh coordinates associated with the mesh */
-   const char* coordnames[3] = {"X", "Y", "Z"};
-   float *coords[3] ;
-   coords[0] = new float[domain.numNode()] ;
-   coords[1] = new float[domain.numNode()] ;
-   coords[2] = new float[domain.numNode()] ;
-   for (int ni=0; ni < domain.numNode() ; ++ni) {
-      coords[0][ni] = float(domain.x(ni)) ;
-      coords[1][ni] = float(domain.y(ni)) ;
-      coords[2][ni] = float(domain.z(ni)) ;
-   }
-   optlist = DBMakeOptlist(2);
-   ok += DBAddOption(optlist, DBOPT_DTIME, &domain.time());
-   ok += DBAddOption(optlist, DBOPT_CYCLE, &domain.cycle());
-   ok += DBPutUcdmesh(db, "mesh", 3, (char**)&coordnames[0], (float**)coords,
-                      domain.numNode(), domain.numElem(), "connectivity",
-                      0, DB_FLOAT, optlist);
-   ok += DBFreeOptlist(optlist);
-   delete [] coords[2] ;
-   delete [] coords[1] ;
-   delete [] coords[0] ;
-
-   /* Write out the materials */
-   int *matnums = new int[domain.numReg()];
-   int dims[1] = {domain.numElem()}; // No mixed elements
-   for(int i=0 ; i<domain.numReg() ; ++i)
-      matnums[i] = i+1;
-   
-   ok += DBPutMaterial(db, "regions", "mesh", domain.numReg(),
-                       matnums, domain.regNumList(), dims, 1,
-                       NULL, NULL, NULL, NULL, 0, DB_FLOAT, NULL);
-   delete [] matnums;
-
-   /* Write out pressure, energy, relvol, q */
-
-   float *e = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      e[ei] = float(domain.e(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "e", "mesh", e,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] e ;
-
-
-   float *p = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      p[ei] = float(domain.p(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "p", "mesh", p,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] p ;
-
-   float *v = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      v[ei] = float(domain.v(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "v", "mesh", v,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] v ;
-
-   float *q = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      q[ei] = float(domain.q(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "q", "mesh", q,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] q ;
-
-   /* Write out nodal speed, velocities */
-   float *zd    = new float[domain.numNode()];
-   float *yd    = new float[domain.numNode()];
-   float *xd    = new float[domain.numNode()];
-   float *speed = new float[domain.numNode()];
-   for(int ni=0 ; ni < domain.numNode() ; ++ni) {
-      xd[ni]    = float(domain.xd(ni));
-      yd[ni]    = float(domain.yd(ni));
-      zd[ni]    = float(domain.zd(ni));
-      speed[ni] = float(sqrt((xd[ni]*xd[ni])+(yd[ni]*yd[ni])+(zd[ni]*zd[ni])));
-   }
-
-   ok += DBPutUcdvar1(db, "speed", "mesh", speed,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] speed;
-
-
-   ok += DBPutUcdvar1(db, "xd", "mesh", xd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] xd ;
-
-   ok += DBPutUcdvar1(db, "yd", "mesh", yd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] yd ;
-
-   ok += DBPutUcdvar1(db, "zd", "mesh", zd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] zd ;
-
-
-   if (ok != 0) {
-      printf("Error writing out viz file - rank %d\n", myRank);
-   }
-}
-
-/**********************************************************************/
-
-#if USE_MPI     
-void
-   DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 
-                         char basename[], int numRanks)
-#else
-void
-  DumpMultiblockObjects(DBfile *db, char basename[], int numRanks)
-#endif
-{
-   /* MULTIBLOCK objects to tie together multiple files */
-  char **multimeshObjs;
-  char **multimatObjs;
-  char ***multivarObjs;
-  int *blockTypes;
-  int *varTypes;
-  int ok = 0;
-  // Make sure this list matches what's written out above
-  char vars[][10] = {"p","e","v","q", "speed", "xd", "yd", "zd"};
-  int numvars = sizeof(vars)/sizeof(vars[0]);
-
-  // Reset to the root directory of the silo file
-  DBSetDir(db, "/");
-
-  // Allocate a bunch of space for building up the string names
-  multimeshObjs = new char*[numRanks];
-  multimatObjs = new char*[numRanks];
-  multivarObjs = new char**[numvars];
-  blockTypes = new int[numRanks];
-  varTypes = new int[numRanks];
-
-  for(int v=0 ; v<numvars ; ++v) {
-     multivarObjs[v] = new char*[numRanks];
-  }
-  
-  for(int i=0 ; i<numRanks ; ++i) {
-     multimeshObjs[i] = new char[64];
-     multimatObjs[i] = new char[64];
-     for(int v=0 ; v<numvars ; ++v) {
-        multivarObjs[v][i] = new char[64];
-     }
-     blockTypes[i] = DB_UCDMESH;
-     varTypes[i] = DB_UCDVAR;
-  }
-      
-  // Build up the multiobject names
-  for(int i=0 ; i<numRanks ; ++i) {
-#if USE_MPI     
-    int iorank = PMPIO_GroupRank(bat, i);
-#else
-    int iorank = 0;
-#endif
-
-    //delete multivarObjs[i];
-    if (iorank == 0) {
-      snprintf(multimeshObjs[i], 64, "/data_%d/mesh", i);
-      snprintf(multimatObjs[i], 64, "/data_%d/regions",i);
-      for(int v=0 ; v<numvars ; ++v) {
-	snprintf(multivarObjs[v][i], 64, "/data_%d/%s", i, vars[v]);
-      }
-     
-    }
-    else {
-      snprintf(multimeshObjs[i], 64, "%s.%03d:/data_%d/mesh",
-               basename, iorank, i);
-      snprintf(multimatObjs[i], 64, "%s.%03d:/data_%d/regions", 
-	       basename, iorank, i);
-      for(int v=0 ; v<numvars ; ++v) {
-         snprintf(multivarObjs[v][i], 64, "%s.%03d:/data_%d/%s", 
-                  basename, iorank, i, vars[v]);
-      }
-    }
-  }
-
-  // Now write out the objects
-  ok += DBPutMultimesh(db, "mesh", numRanks,
-		       (char**)multimeshObjs, blockTypes, NULL);
-  ok += DBPutMultimat(db, "regions", numRanks,
-		      (char**)multimatObjs, NULL);
-  for(int v=0 ; v<numvars ; ++v) {
-     ok += DBPutMultivar(db, vars[v], numRanks,
-                         (char**)multivarObjs[v], varTypes, NULL);
-  }
-
-  for(int v=0; v < numvars; ++v) {
-    for(int i = 0; i < numRanks; i++) {
-      delete multivarObjs[v][i];
-    }
-    delete multivarObjs[v];
-  }
-
-  // Clean up
-  for(int i=0 ; i<numRanks ; i++) {
-    delete multimeshObjs[i];
-    delete multimatObjs[i];
-  }
-  delete [] multimeshObjs;
-  delete [] multimatObjs;
-  delete [] multivarObjs;
-  delete [] blockTypes;
-  delete [] varTypes;
-
-  if (ok != 0) {
-    printf("Error writing out multiXXX objs to viz file - rank 0\n");
-  }
-}
-
-# if USE_MPI
-
-/**********************************************************************/
-
-static void *
-LULESH_PMPIO_Create(const char *fname,
-		     const char *dname,
-		     void *udata)
-{
-   /* Create the file */
-   DBfile* db = DBCreate(fname, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X);
-
-   /* Put the data in a subdirectory, so VisIt only sees the multimesh
-    * objects we write out in the base file */
-   if (db) {
-     DBMkDir(db, dname);
-     DBSetDir(db, dname);
-   }
-   return (void*)db;
-}
-
-   
-/**********************************************************************/
-
-static void *
-LULESH_PMPIO_Open(const char *fname,
-		   const char *dname,
-		   PMPIO_iomode_t ioMode,
-		   void *udata)
-{
-   /* Open the file */
-  DBfile* db = DBOpen(fname, DB_UNKNOWN, DB_APPEND);
-
-   /* Put the data in a subdirectory, so VisIt only sees the multimesh
-    * objects we write out in the base file */
-   if (db) {
-     DBMkDir(db, dname);
-     DBSetDir(db, dname);
-   }
-   return (void*)db;
-}
-
-   
-/**********************************************************************/
-
-static void
-LULESH_PMPIO_Close(void *file, void *udata)
-{
-  DBfile *db = (DBfile*)file;
-  if (db)
-    DBClose(db);
-}
-# endif
-
-   
-#else
-
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks)
-{
-   if (myRank == 0) {
-      printf("Must enable -DVIZ_MESH at compile time to call DumpDomain\n");
-   }
-}
-
-#endif
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.cc b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.cc
deleted file mode 100644
index 04002bf8c..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.cc
+++ /dev/null
@@ -1,2727 +0,0 @@
-/*
-  This is a Version 2.0 MPI + OpenMP implementation of LULESH
-
-                 Copyright (c) 2010-2013.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 2.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-//////////////
-DIFFERENCES BETWEEN THIS VERSION (2.x) AND EARLIER VERSIONS:
-* Addition of regions to make work more representative of multi-material codes
-* Default size of each domain is 30^3 (27000 elem) instead of 45^3. This is
-  more representative of our actual working set sizes
-* Single source distribution supports pure serial, pure OpenMP, MPI-only, 
-  and MPI+OpenMP
-* Addition of ability to visualize the mesh using VisIt 
-  https://wci.llnl.gov/codes/visit/download.html
-* Various command line options (see ./lulesh2.0 -h)
- -q              : quiet mode - suppress stdout
- -i <iterations> : number of cycles to run
- -s <size>       : length of cube mesh along side
- -r <numregions> : Number of distinct regions (def: 11)
- -b <balance>    : Load balance between regions of a domain (def: 1)
- -c <cost>       : Extra cost of more expensive regions (def: 1)
- -f <filepieces> : Number of file parts for viz output (def: np/9)
- -p              : Print out progress
- -v              : Output viz file (requires compiling with -DVIZ_MESH
- -h              : This message
-
- printf("Usage: %s [opts]\n", execname);
-      printf(" where [opts] is one or more of:\n");
-      printf(" -q              : quiet mode - suppress all stdout\n");
-      printf(" -i <iterations> : number of cycles to run\n");
-      printf(" -s <size>       : length of cube mesh along side\n");
-      printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
-      printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
-      printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
-      printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
-      printf(" -p              : Print out progress\n");
-      printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
-      printf(" -h              : This message\n");
-      printf("\n\n");
-
-*Notable changes in LULESH 2.0
-
-* Split functionality into different files
-lulesh.cc - where most (all?) of the timed functionality lies
-lulesh-comm.cc - MPI functionality
-lulesh-init.cc - Setup code
-lulesh-viz.cc  - Support for visualization option
-lulesh-util.cc - Non-timed functions
-*
-* The concept of "regions" was added, although every region is the same ideal
-*    gas material, and the same sedov blast wave problem is still the only
-*    problem its hardcoded to solve.
-* Regions allow two things important to making this proxy app more representative:
-*   Four of the LULESH routines are now performed on a region-by-region basis,
-*     making the memory access patterns non-unit stride
-*   Artificial load imbalances can be easily introduced that could impact
-*     parallelization strategies.  
-* The load balance flag changes region assignment.  Region number is raised to
-*   the power entered for assignment probability.  Most likely regions changes
-*   with MPI process id.
-* The cost flag raises the cost of ~45% of the regions to evaluate EOS by the
-*   entered multiple. The cost of 5% is 10x the entered multiple.
-* MPI and OpenMP were added, and coalesced into a single version of the source
-*   that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
-* Added support to write plot files using "poor mans parallel I/O" when linked
-*   with the silo library, which in turn can be read by VisIt.
-* Enabled variable timestep calculation by default (courant condition), which
-*   results in an additional reduction.
-* Default domain (mesh) size reduced from 45^3 to 30^3
-* Command line options to allow numerous test cases without needing to recompile
-* Performance optimizations and code cleanup beyond LULESH 1.0
-* Added a "Figure of Merit" calculation (elements solved per microsecond) and
-*   output in support of using LULESH 2.0 for the 2017 CORAL procurement
-*
-* Possible Differences in Final Release (other changes possible)
-*
-* High Level mesh structure to allow data structure transformations
-* Different default parameters
-* Minor code performance changes and cleanup
-
-TODO in future versions
-* Add reader for (truly) unstructured meshes, probably serial only
-* CMake based build system
-
-//////////////
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <climits>
-#include <vector>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <time.h>
-#include <sys/time.h>
-#include <iostream>
-#include <unistd.h>
-
-#if USE_OMP
-# include <omp.h>
-#endif
-
-#include "lulesh.h"
-
-#include "Timer.hxx"
-
-
-
-#define RAJA_STORAGE static inline
-
-//typedef RAJA::seq_exec              Segment_Exec;
-//typedef RAJA::simd_exec             Segment_Exec;
-//typedef RAJA::seq_reduce            reduce_policy;
-
-typedef RAJA::omp_parallel_for_exec Segment_Exec;
-typedef RAJA::omp_reduce            reduce_policy;
-
-//typedef RAJA::cilk_for_exec         Segment_Exec;
-//typedef RAJA::cilk_reduce            reduce_policy;
-
-typedef Segment_Exec node_exec_policy;
-typedef Segment_Exec elem_exec_policy;
-typedef Segment_Exec min_exec_policy;
-typedef Segment_Exec mat_exec_policy;
-typedef Segment_Exec range_exec_policy;
-
-
-
-/*********************************/
-/* Data structure implementation */
-/*********************************/
-
-/* might want to add access methods so that memory can be */
-/* better managed, as in luleshFT */
-
-template <typename T>
-T *Allocate(size_t size)
-{
-   return static_cast<T *>(malloc(sizeof(T)*size)) ;
-}
-
-template <typename T>
-void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-
-
-/******************************************/
-
-/* Work Routines */
-
-RAJA_STORAGE
-void TimeIncrement(Domain& domain)
-{
-   Real_t targetdt = domain.stoptime() - domain.time() ;
-
-   if ((domain.dtfixed() <= Real_t(0.0)) && (domain.cycle() != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = domain.deltatime() ;
-
-      /* This will require a reduction in parallel */
-      Real_t gnewdt = Real_t(1.0e+20) ;
-      Real_t newdt ;
-      if (domain.dtcourant() < gnewdt) {
-         gnewdt = domain.dtcourant() / Real_t(2.0) ;
-      }
-      if (domain.dthydro() < gnewdt) {
-         gnewdt = domain.dthydro() * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-#if USE_MPI      
-      MPI_Allreduce(&gnewdt, &newdt, 1,
-                    ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE),
-                    MPI_MIN, MPI_COMM_WORLD) ;
-#else
-      newdt = gnewdt;
-#endif
-      
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < domain.deltatimemultlb()) {
-            newdt = olddt ;
-         }
-         else if (ratio > domain.deltatimemultub()) {
-            newdt = olddt*domain.deltatimemultub() ;
-         }
-      }
-
-      if (newdt > domain.dtmax()) {
-         newdt = domain.dtmax() ;
-      }
-      domain.deltatime() = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > domain.deltatime()) &&
-       (targetdt < (Real_t(4.0) * domain.deltatime() / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * domain.deltatime() / Real_t(3.0) ;
-   }
-
-   if (targetdt < domain.deltatime()) {
-      domain.deltatime() = targetdt ;
-   }
-
-   domain.time() += domain.deltatime() ;
-
-   ++domain.cycle() ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CollectDomainNodesToElemNodes(Domain* domain,
-                                   const Index_t* elemToNode,
-                                   Real_t elemX[8],
-                                   Real_t elemY[8],
-                                   Real_t elemZ[8])
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = domain->x(nd0i);
-   elemX[1] = domain->x(nd1i);
-   elemX[2] = domain->x(nd2i);
-   elemX[3] = domain->x(nd3i);
-   elemX[4] = domain->x(nd4i);
-   elemX[5] = domain->x(nd5i);
-   elemX[6] = domain->x(nd6i);
-   elemX[7] = domain->x(nd7i);
-
-   elemY[0] = domain->y(nd0i);
-   elemY[1] = domain->y(nd1i);
-   elemY[2] = domain->y(nd2i);
-   elemY[3] = domain->y(nd3i);
-   elemY[4] = domain->y(nd4i);
-   elemY[5] = domain->y(nd5i);
-   elemY[6] = domain->y(nd6i);
-   elemY[7] = domain->y(nd7i);
-
-   elemZ[0] = domain->z(nd0i);
-   elemZ[1] = domain->z(nd1i);
-   elemZ[2] = domain->z(nd2i);
-   elemZ[3] = domain->z(nd3i);
-   elemZ[4] = domain->z(nd4i);
-   elemZ[5] = domain->z(nd5i);
-   elemZ[6] = domain->z(nd6i);
-   elemZ[7] = domain->z(nd7i);
-
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void InitStressTermsForElems(Domain* domain,
-                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                             Index_t numElem)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-
-   RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-      sigxx[i] = sigyy[i] = sigzz[i] =  - domain->p(i) - domain->q(i) ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemShapeFunctionDerivatives( Real_t const x[],
-                                       Real_t const y[],
-                                       Real_t const z[],
-                                       Real_t b[][8],
-                                       Real_t* const volume )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemNodeNormals(Real_t pfx[8],
-                         Real_t pfy[8],
-                         Real_t pfz[8],
-                         const Real_t x[8],
-                         const Real_t y[8],
-                         const Real_t z[8])
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_t* fx, Real_t* fy, Real_t* fz )
-{
-   for(Index_t i = 0; i < 8; i++) {
-      fx[i] = -( stress_xx * B[0][i] );
-      fy[i] = -( stress_yy * B[1][i]  );
-      fz[i] = -( stress_zz * B[2][i] );
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void IntegrateStressForElems( Domain* domain,
-                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                              Real_t *determ, Index_t numElem, Index_t numNode)
-{
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *fx_elem;
-   Real_t *fy_elem;
-   Real_t *fz_elem;
-   Real_t fx_local[8] ;
-   Real_t fy_local[8] ;
-   Real_t fz_local[8] ;
-
-   Real_t* tfx_local = fx_local;
-   Real_t* tfy_local = fy_local;
-   Real_t* tfz_local = fz_local;
-
-
-  if (numthreads > 1) {
-     fx_elem = Allocate<Real_t>(numElem8) ;
-     fy_elem = Allocate<Real_t>(numElem8) ;
-     fz_elem = Allocate<Real_t>(numElem8) ;
-  }
-  // loop over all elements
-
-  RAJA::forall<elem_exec_policy>(0, numElem, [=] (int k) {
-    const Index_t* const elemToNode = domain->nodelist(k);
-    Real_t B[3][8] ;// shape function derivatives
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
-
-    // Volume calculation involves extra work for numerical consistency
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                         B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                          x_local, y_local, z_local );
-
-    if (numthreads > 1) {
-       // Eliminate thread writing conflicts at the nodes by giving
-       // each element its own copy to write to
-       SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                    &fx_elem[k*8],
-                                    &fy_elem[k*8],
-                                    &fz_elem[k*8] ) ;
-    }
-    else {
-       SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                    tfx_local, tfy_local, tfz_local ) ;
-
-       // copy nodal force contributions to global force arrray.
-       for( Index_t lnode=0 ; lnode<8 ; ++lnode ) {
-          Index_t gnode = elemToNode[lnode];
-          domain->fx(gnode) += tfx_local[lnode];
-          domain->fy(gnode) += tfy_local[lnode];
-          domain->fz(gnode) += tfz_local[lnode];
-       }
-    }
-  } );
-
-  if (numthreads > 1) {
-     // If threaded, then we need to copy the data out of the temporary
-     // arrays used above into the final forces field
-     RAJA::forall<node_exec_policy>(0, numNode, [=] (int gnode) {
-        Index_t count = domain->nodeElemCount(gnode) ;
-        Index_t *cornerList = domain->nodeElemCornerList(gnode) ;
-        Real_t fx_tmp = Real_t(0.0) ;
-        Real_t fy_tmp = Real_t(0.0) ;
-        Real_t fz_tmp = Real_t(0.0) ;
-        for (Index_t i=0 ; i < count ; ++i) {
-           Index_t ielem = cornerList[i] ;
-           fx_tmp += fx_elem[ielem] ;
-           fy_tmp += fy_elem[ielem] ;
-           fz_tmp += fz_elem[ielem] ;
-        }
-        domain->fx(gnode) = fx_tmp ;
-        domain->fy(gnode) = fy_tmp ;
-        domain->fz(gnode) = fz_tmp ;
-     } );
-     Release(&fz_elem) ;
-     Release(&fy_elem) ;
-     Release(&fx_elem) ;
-  }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemVolumeDerivative(Real_t dvdx[8],
-                              Real_t dvdy[8],
-                              Real_t dvdz[8],
-                              const Real_t x[8],
-                              const Real_t y[8],
-                              const Real_t z[8])
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t hourgam[][4],
-                              Real_t coefficient,
-                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
-{
-   Real_t hxx[4];
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * xd[0] + hourgam[1][i] * xd[1] +
-               hourgam[2][i] * xd[2] + hourgam[3][i] * xd[3] +
-               hourgam[4][i] * xd[4] + hourgam[5][i] * xd[5] +
-               hourgam[6][i] * xd[6] + hourgam[7][i] * xd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfx[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * yd[0] + hourgam[1][i] * yd[1] +
-               hourgam[2][i] * yd[2] + hourgam[3][i] * yd[3] +
-               hourgam[4][i] * yd[4] + hourgam[5][i] * yd[5] +
-               hourgam[6][i] * yd[6] + hourgam[7][i] * yd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfy[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * zd[0] + hourgam[1][i] * zd[1] +
-               hourgam[2][i] * zd[2] + hourgam[3][i] * zd[3] +
-               hourgam[4][i] * zd[4] + hourgam[5][i] * zd[5] +
-               hourgam[6][i] * zd[6] + hourgam[7][i] * zd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfz[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcFBHourglassForceForElems( Domain* domain,
-                                   Real_t *determ,
-                                   Real_t *x8n, Real_t *y8n, Real_t *z8n,
-                                   Real_t *dvdx, Real_t *dvdy, Real_t *dvdz,
-                                   Real_t hourg, Index_t numElem,
-                                   Index_t numNode)
-{
-
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-   /*************************************************
-    *
-    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-    *               force.
-    *
-    *************************************************/
-  
-   Index_t numElem8 = numElem * 8 ;
-
-   Real_t *fx_elem; 
-   Real_t *fy_elem; 
-   Real_t *fz_elem; 
-
-   if(numthreads > 1) {
-      fx_elem = Allocate<Real_t>(numElem8) ;
-      fy_elem = Allocate<Real_t>(numElem8) ;
-      fz_elem = Allocate<Real_t>(numElem8) ;
-   }
-
-   Real_t  gamma[4][8];
-
-   gamma[0][0] = Real_t( 1.);
-   gamma[0][1] = Real_t( 1.);
-   gamma[0][2] = Real_t(-1.);
-   gamma[0][3] = Real_t(-1.);
-   gamma[0][4] = Real_t(-1.);
-   gamma[0][5] = Real_t(-1.);
-   gamma[0][6] = Real_t( 1.);
-   gamma[0][7] = Real_t( 1.);
-   gamma[1][0] = Real_t( 1.);
-   gamma[1][1] = Real_t(-1.);
-   gamma[1][2] = Real_t(-1.);
-   gamma[1][3] = Real_t( 1.);
-   gamma[1][4] = Real_t(-1.);
-   gamma[1][5] = Real_t( 1.);
-   gamma[1][6] = Real_t( 1.);
-   gamma[1][7] = Real_t(-1.);
-   gamma[2][0] = Real_t( 1.);
-   gamma[2][1] = Real_t(-1.);
-   gamma[2][2] = Real_t( 1.);
-   gamma[2][3] = Real_t(-1.);
-   gamma[2][4] = Real_t( 1.);
-   gamma[2][5] = Real_t(-1.);
-   gamma[2][6] = Real_t( 1.);
-   gamma[2][7] = Real_t(-1.);
-   gamma[3][0] = Real_t(-1.);
-   gamma[3][1] = Real_t( 1.);
-   gamma[3][2] = Real_t(-1.);
-   gamma[3][3] = Real_t( 1.);
-   gamma[3][4] = Real_t( 1.);
-   gamma[3][5] = Real_t(-1.);
-   gamma[3][6] = Real_t( 1.);
-   gamma[3][7] = Real_t(-1.);
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-
-   RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i2) {
-      Real_t *fx_local, *fy_local, *fz_local ;
-      Real_t hgfx[8], hgfy[8], hgfz[8] ;
-
-      Real_t coefficient;
-
-      Real_t hourgam[8][4];
-      Real_t xd1[8], yd1[8], zd1[8] ;
-
-      const Index_t *elemToNode = domain->nodelist(i2);
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
-            x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
-            x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
-            x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
-            y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
-            y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
-            y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
-            z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
-            z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
-            z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];
-
-         hourgam[0][i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam[1][i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam[2][i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam[3][i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam[4][i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam[5][i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam[6][i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam[7][i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=domain->ss(i2);
-      mass1=domain->elemMass(i2);
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = domain->xd(n0si2);
-      xd1[1] = domain->xd(n1si2);
-      xd1[2] = domain->xd(n2si2);
-      xd1[3] = domain->xd(n3si2);
-      xd1[4] = domain->xd(n4si2);
-      xd1[5] = domain->xd(n5si2);
-      xd1[6] = domain->xd(n6si2);
-      xd1[7] = domain->xd(n7si2);
-
-      yd1[0] = domain->yd(n0si2);
-      yd1[1] = domain->yd(n1si2);
-      yd1[2] = domain->yd(n2si2);
-      yd1[3] = domain->yd(n3si2);
-      yd1[4] = domain->yd(n4si2);
-      yd1[5] = domain->yd(n5si2);
-      yd1[6] = domain->yd(n6si2);
-      yd1[7] = domain->yd(n7si2);
-
-      zd1[0] = domain->zd(n0si2);
-      zd1[1] = domain->zd(n1si2);
-      zd1[2] = domain->zd(n2si2);
-      zd1[3] = domain->zd(n3si2);
-      zd1[4] = domain->zd(n4si2);
-      zd1[5] = domain->zd(n5si2);
-      zd1[6] = domain->zd(n6si2);
-      zd1[7] = domain->zd(n7si2);
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1,
-                      hourgam,
-                      coefficient, hgfx, hgfy, hgfz);
-
-      // With the threaded version, we write into local arrays per elem
-      // so we don't have to worry about race conditions
-      if (numthreads > 1) {
-         fx_local = &fx_elem[i3] ;
-         fx_local[0] = hgfx[0];
-         fx_local[1] = hgfx[1];
-         fx_local[2] = hgfx[2];
-         fx_local[3] = hgfx[3];
-         fx_local[4] = hgfx[4];
-         fx_local[5] = hgfx[5];
-         fx_local[6] = hgfx[6];
-         fx_local[7] = hgfx[7];
-
-         fy_local = &fy_elem[i3] ;
-         fy_local[0] = hgfy[0];
-         fy_local[1] = hgfy[1];
-         fy_local[2] = hgfy[2];
-         fy_local[3] = hgfy[3];
-         fy_local[4] = hgfy[4];
-         fy_local[5] = hgfy[5];
-         fy_local[6] = hgfy[6];
-         fy_local[7] = hgfy[7];
-
-         fz_local = &fz_elem[i3] ;
-         fz_local[0] = hgfz[0];
-         fz_local[1] = hgfz[1];
-         fz_local[2] = hgfz[2];
-         fz_local[3] = hgfz[3];
-         fz_local[4] = hgfz[4];
-         fz_local[5] = hgfz[5];
-         fz_local[6] = hgfz[6];
-         fz_local[7] = hgfz[7];
-      }
-      else {
-         domain->fx(n0si2) += hgfx[0];
-         domain->fy(n0si2) += hgfy[0];
-         domain->fz(n0si2) += hgfz[0];
-
-         domain->fx(n1si2) += hgfx[1];
-         domain->fy(n1si2) += hgfy[1];
-         domain->fz(n1si2) += hgfz[1];
-
-         domain->fx(n2si2) += hgfx[2];
-         domain->fy(n2si2) += hgfy[2];
-         domain->fz(n2si2) += hgfz[2];
-
-         domain->fx(n3si2) += hgfx[3];
-         domain->fy(n3si2) += hgfy[3];
-         domain->fz(n3si2) += hgfz[3];
-
-         domain->fx(n4si2) += hgfx[4];
-         domain->fy(n4si2) += hgfy[4];
-         domain->fz(n4si2) += hgfz[4];
-
-         domain->fx(n5si2) += hgfx[5];
-         domain->fy(n5si2) += hgfy[5];
-         domain->fz(n5si2) += hgfz[5];
-
-         domain->fx(n6si2) += hgfx[6];
-         domain->fy(n6si2) += hgfy[6];
-         domain->fz(n6si2) += hgfz[6];
-
-         domain->fx(n7si2) += hgfx[7];
-         domain->fy(n7si2) += hgfy[7];
-         domain->fz(n7si2) += hgfz[7];
-      }
-   } );
-
-   if (numthreads > 1) {
-     // Collect the data from the local arrays into the final force arrays
-      RAJA::forall<node_exec_policy>(0, numNode, [=] (int gnode) {
-         Index_t count = domain->nodeElemCount(gnode) ;
-         Index_t *cornerList = domain->nodeElemCornerList(gnode) ;
-         Real_t fx_tmp = Real_t(0.0) ;
-         Real_t fy_tmp = Real_t(0.0) ;
-         Real_t fz_tmp = Real_t(0.0) ;
-         for (Index_t i=0 ; i < count ; ++i) {
-            Index_t ielem = cornerList[i] ;
-            fx_tmp += fx_elem[ielem] ;
-            fy_tmp += fy_elem[ielem] ;
-            fz_tmp += fz_elem[ielem] ;
-         }
-         domain->fx(gnode) += fx_tmp ;
-         domain->fy(gnode) += fy_tmp ;
-         domain->fz(gnode) += fz_tmp ;
-      } );
-      Release(&fz_elem) ;
-      Release(&fy_elem) ;
-      Release(&fx_elem) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcHourglassControlForElems(Domain* domain,
-                                  Real_t determ[], Real_t hgcoef)
-{
-   Index_t numElem = domain->numElem() ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *dvdx = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdy = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdz = Allocate<Real_t>(numElem8) ;
-   Real_t *x8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *y8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *z8n  = Allocate<Real_t>(numElem8) ;
-
-   /* start loop over elements */
-   RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-      Real_t  x1[8],  y1[8],  z1[8] ;
-      Real_t pfx[8], pfy[8], pfz[8] ;
-
-      Index_t* elemToNode = domain->nodelist(i);
-      CollectDomainNodesToElemNodes(domain, elemToNode, x1, y1, z1);
-
-      CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
-
-      /* load into temporary storage for FB Hour Glass control */
-      for(Index_t ii=0;ii<8;++ii) {
-         Index_t jj=8*i+ii;
-
-         dvdx[jj] = pfx[ii];
-         dvdy[jj] = pfy[ii];
-         dvdz[jj] = pfz[ii];
-         x8n[jj]  = x1[ii];
-         y8n[jj]  = y1[ii];
-         z8n[jj]  = z1[ii];
-      }
-
-      determ[i] = domain->volo(i) * domain->v(i);
-
-      /* Do a check for negative volumes */
-      if ( domain->v(i) <= Real_t(0.0) ) {
-#if USE_MPI         
-         MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-         exit(VolumeError);
-#endif
-      }
-   } );
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems( domain,
-                                    determ, x8n, y8n, z8n, dvdx, dvdy, dvdz,
-                                    hgcoef, numElem, domain->numNode()) ;
-   }
-
-   Release(&z8n) ;
-   Release(&y8n) ;
-   Release(&x8n) ;
-   Release(&dvdz) ;
-   Release(&dvdy) ;
-   Release(&dvdx) ;
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcVolumeForceForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-   if (numElem != 0) {
-      Real_t  hgcoef = domain->hgcoef() ;
-      Real_t *sigxx  = Allocate<Real_t>(numElem) ;
-      Real_t *sigyy  = Allocate<Real_t>(numElem) ;
-      Real_t *sigzz  = Allocate<Real_t>(numElem) ;
-      Real_t *determ = Allocate<Real_t>(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(domain, sigxx, sigyy, sigzz, numElem);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( domain,
-                               sigxx, sigyy, sigzz, determ, numElem,
-                               domain->numNode()) ;
-
-      // check for negative element volume
-      RAJA::forall<elem_exec_policy>(0, numElem, [=] (int k) {
-         if (determ[k] <= Real_t(0.0)) {
-#if USE_MPI            
-            MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-            exit(VolumeError);
-#endif
-         }
-      } );
-
-      CalcHourglassControlForElems(domain, determ, hgcoef) ;
-
-      Release(&determ) ;
-      Release(&sigzz) ;
-      Release(&sigyy) ;
-      Release(&sigxx) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE void CalcForceForNodes(Domain* domain)
-{
-  Index_t numNode = domain->numNode() ;
-
-#if USE_MPI  
-  CommRecv(*domain, MSG_COMM_SBN, 3,
-           domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-           true, false) ;
-#endif  
-
-  RAJA::forall<node_exec_policy>(0, numNode, [=] (int i) {
-     domain->fx(i) = Real_t(0.0) ;
-     domain->fy(i) = Real_t(0.0) ;
-     domain->fz(i) = Real_t(0.0) ;
-  } );
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems(domain) ;
-
-#if USE_MPI  
-  Domain_member fieldData[3] ;
-  fieldData[0] = &Domain::fx ;
-  fieldData[1] = &Domain::fy ;
-  fieldData[2] = &Domain::fz ;
-  
-  CommSend(*domain, MSG_COMM_SBN, 3, fieldData,
-           domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() +  1,
-           true, false) ;
-  CommSBN(*domain, 3, fieldData) ;
-#endif  
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcAccelerationForNodes(Domain* domain, Index_t numNode)
-{
-   
-   RAJA::forall<node_exec_policy>(0, numNode, [=] (int i) {
-      domain->xdd(i) = domain->fx(i) / domain->nodalMass(i);
-      domain->ydd(i) = domain->fy(i) / domain->nodalMass(i);
-      domain->zdd(i) = domain->fz(i) / domain->nodalMass(i);
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void ApplyAccelerationBoundaryConditionsForNodes(Domain* domain)
-{
-   Index_t size = domain->sizeX();
-   Index_t numNodeBC = (size+1)*(size+1) ;
-
-   if (!domain->symmXempty() != 0) {
-      RAJA::forall<range_exec_policy>(int(0), int(numNodeBC), [=] (int i) {
-         domain->xdd(domain->symmX(i)) = Real_t(0.0) ;
-      } );
-   }
-
-   if (!domain->symmYempty() != 0) {
-      RAJA::forall<range_exec_policy>(int(0), int(numNodeBC), [=] (int i) {
-         domain->ydd(domain->symmY(i)) = Real_t(0.0) ;
-      } );
-   }
-
-   if (!domain->symmZempty() != 0) {
-      RAJA::forall<range_exec_policy>(int(0), int(numNodeBC), [=] (int i) {
-         domain->zdd(domain->symmZ(i)) = Real_t(0.0) ;
-      } );
-   }
-
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcVelocityForNodes(Domain* domain, const Real_t dt, const Real_t u_cut,
-                          Index_t numNode)
-{
-
-   RAJA::forall<node_exec_policy>(0, numNode, [=] (int i) {
-     Real_t xdtmp, ydtmp, zdtmp ;
-
-     xdtmp = domain->xd(i) + domain->xdd(i) * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     domain->xd(i) = xdtmp ;
-
-     ydtmp = domain->yd(i) + domain->ydd(i) * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     domain->yd(i) = ydtmp ;
-
-     zdtmp = domain->zd(i) + domain->zdd(i) * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     domain->zd(i) = zdtmp ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcPositionForNodes(Domain* domain, const Real_t dt, Index_t numNode)
-{
-   RAJA::forall<node_exec_policy>(0, numNode, [=] (int i) {
-     domain->x(i) += domain->xd(i) * dt ;
-     domain->y(i) += domain->yd(i) * dt ;
-     domain->z(i) += domain->zd(i) * dt ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeNodal(Domain* domain)
-{
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-   Domain_member fieldData[6] ;
-#endif
-
-   const Real_t delt = domain->deltatime() ;
-   Real_t u_cut = domain->u_cut() ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes(domain);
-
-#if USE_MPI  
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-   CommRecv(*domain, MSG_SYNC_POS_VEL, 6,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-#endif
-#endif
-   
-   CalcAccelerationForNodes(domain, domain->numNode());
-   
-   ApplyAccelerationBoundaryConditionsForNodes(domain);
-
-   CalcVelocityForNodes( domain, delt, u_cut, domain->numNode()) ;
-
-   CalcPositionForNodes( domain, delt, domain->numNode() );
-#if USE_MPI
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-  fieldData[0] = &Domain::x ;
-  fieldData[1] = &Domain::y ;
-  fieldData[2] = &Domain::z ;
-  fieldData[3] = &Domain::xd ;
-  fieldData[4] = &Domain::yd ;
-  fieldData[5] = &Domain::zd ;
-
-   CommSend(*domain, MSG_SYNC_POS_VEL, 6, fieldData,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-   CommSyncPosVel(*domain) ;
-#endif
-#endif
-   
-  return;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-/******************************************/
-
-//inline
-Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemVelocityGradient( const Real_t* const xvel,
-                                const Real_t* const yvel,
-                                const Real_t* const zvel,
-                                const Real_t b[][8],
-                                const Real_t detJ,
-                                Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-/******************************************/
-
-//RAJA_STORAGE
-void CalcKinematicsForElems( Domain* domain,
-                             Real_t deltaTime, Index_t numElem )
-{
-
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(0, numElem, [=] (int k) { 
-    Real_t B[3][8] ; /** shape function derivatives */
-    Real_t D[6] ;
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-    Real_t xd_local[8] ;
-    Real_t yd_local[8] ;
-    Real_t zd_local[8] ;
-    Real_t detJ = Real_t(0.0) ;
-
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_t* const elemToNode = domain->nodelist(k) ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / domain->volo(k) ;
-    domain->vnew(k) = relativeVolume ;
-    domain->delv(k) = relativeVolume - domain->v(k) ;
-
-    // set characteristic length
-    domain->arealg(k) = CalcElemCharacteristicLength(x_local, y_local, z_local,
-                                             volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = domain->xd(gnode);
-      yd_local[lnode] = domain->yd(gnode);
-      zd_local[lnode] = domain->zd(gnode);
-    }
-
-    Real_t dt2 = Real_t(0.5) * deltaTime;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local, y_local, z_local,
-                                      B, &detJ );
-
-    CalcElemVelocityGradient( xd_local, yd_local, zd_local,
-                               B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    domain->dxx(k) = D[0];
-    domain->dyy(k) = D[1];
-    domain->dzz(k) = D[2];
-  } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcLagrangeElements(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-   if (numElem > 0) {
-      const Real_t deltatime = domain->deltatime() ;
-
-      domain->AllocateStrains(numElem);
-
-      CalcKinematicsForElems(domain, deltatime, numElem) ;
-
-      // element loop to do some stuff not included in the elemlib function.
-      RAJA::forall<elem_exec_policy>(0, numElem, [=] (int k) {
-         // calc strain rate and apply as constraint (only done in FB element)
-         Real_t vdov = domain->dxx(k) + domain->dyy(k) + domain->dzz(k) ;
-         Real_t vdovthird = vdov/Real_t(3.0) ;
-
-         // make the rate of deformation tensor deviatoric
-         domain->vdov(k) = vdov ;
-         domain->dxx(k) -= vdovthird ;
-         domain->dyy(k) -= vdovthird ;
-         domain->dzz(k) -= vdovthird ;
-
-        // See if any volumes are negative, and take appropriate action.
-         if (domain->vnew(k) <= Real_t(0.0))
-        {
-#if USE_MPI           
-           MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-           exit(VolumeError);
-#endif
-        }
-      } );
-      domain->DeallocateStrains();
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQGradientsForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem();
-
-   RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-      const Real_t ptiny = Real_t(1.e-36) ;
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      const Index_t *elemToNode = domain->nodelist(i);
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = domain->x(n0) ;
-      Real_t x1 = domain->x(n1) ;
-      Real_t x2 = domain->x(n2) ;
-      Real_t x3 = domain->x(n3) ;
-      Real_t x4 = domain->x(n4) ;
-      Real_t x5 = domain->x(n5) ;
-      Real_t x6 = domain->x(n6) ;
-      Real_t x7 = domain->x(n7) ;
-
-      Real_t y0 = domain->y(n0) ;
-      Real_t y1 = domain->y(n1) ;
-      Real_t y2 = domain->y(n2) ;
-      Real_t y3 = domain->y(n3) ;
-      Real_t y4 = domain->y(n4) ;
-      Real_t y5 = domain->y(n5) ;
-      Real_t y6 = domain->y(n6) ;
-      Real_t y7 = domain->y(n7) ;
-
-      Real_t z0 = domain->z(n0) ;
-      Real_t z1 = domain->z(n1) ;
-      Real_t z2 = domain->z(n2) ;
-      Real_t z3 = domain->z(n3) ;
-      Real_t z4 = domain->z(n4) ;
-      Real_t z5 = domain->z(n5) ;
-      Real_t z6 = domain->z(n6) ;
-      Real_t z7 = domain->z(n7) ;
-
-      Real_t xv0 = domain->xd(n0) ;
-      Real_t xv1 = domain->xd(n1) ;
-      Real_t xv2 = domain->xd(n2) ;
-      Real_t xv3 = domain->xd(n3) ;
-      Real_t xv4 = domain->xd(n4) ;
-      Real_t xv5 = domain->xd(n5) ;
-      Real_t xv6 = domain->xd(n6) ;
-      Real_t xv7 = domain->xd(n7) ;
-
-      Real_t yv0 = domain->yd(n0) ;
-      Real_t yv1 = domain->yd(n1) ;
-      Real_t yv2 = domain->yd(n2) ;
-      Real_t yv3 = domain->yd(n3) ;
-      Real_t yv4 = domain->yd(n4) ;
-      Real_t yv5 = domain->yd(n5) ;
-      Real_t yv6 = domain->yd(n6) ;
-      Real_t yv7 = domain->yd(n7) ;
-
-      Real_t zv0 = domain->zd(n0) ;
-      Real_t zv1 = domain->zd(n1) ;
-      Real_t zv2 = domain->zd(n2) ;
-      Real_t zv3 = domain->zd(n3) ;
-      Real_t zv4 = domain->zd(n4) ;
-      Real_t zv5 = domain->zd(n5) ;
-      Real_t zv6 = domain->zd(n6) ;
-      Real_t zv7 = domain->zd(n7) ;
-
-      Real_t vol = domain->volo(i)*domain->vnew(i) ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*((x0+x1+x5+x4) - (x3+x2+x6+x7)) ;
-      Real_t dyj = Real_t(-0.25)*((y0+y1+y5+y4) - (y3+y2+y6+y7)) ;
-      Real_t dzj = Real_t(-0.25)*((z0+z1+z5+z4) - (z3+z2+z6+z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*((x1+x2+x6+x5) - (x0+x3+x7+x4)) ;
-      Real_t dyi = Real_t( 0.25)*((y1+y2+y6+y5) - (y0+y3+y7+y4)) ;
-      Real_t dzi = Real_t( 0.25)*((z1+z2+z6+z5) - (z0+z3+z7+z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*((x4+x5+x6+x7) - (x0+x1+x2+x3)) ;
-      Real_t dyk = Real_t( 0.25)*((y4+y5+y6+y7) - (y0+y1+y2+y3)) ;
-      Real_t dzk = Real_t( 0.25)*((z4+z5+z6+z7) - (z0+z1+z2+z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      domain->delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*((xv4+xv5+xv6+xv7) - (xv0+xv1+xv2+xv3)) ;
-      dyv = Real_t(0.25)*((yv4+yv5+yv6+yv7) - (yv0+yv1+yv2+yv3)) ;
-      dzv = Real_t(0.25)*((zv4+zv5+zv6+zv7) - (zv0+zv1+zv2+zv3)) ;
-
-      domain->delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      domain->delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*((xv1+xv2+xv6+xv5) - (xv0+xv3+xv7+xv4)) ;
-      dyv = Real_t(0.25)*((yv1+yv2+yv6+yv5) - (yv0+yv3+yv7+yv4)) ;
-      dzv = Real_t(0.25)*((zv1+zv2+zv6+zv5) - (zv0+zv3+zv7+zv4)) ;
-
-      domain->delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      domain->delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*((xv0+xv1+xv5+xv4) - (xv3+xv2+xv6+xv7)) ;
-      dyv = Real_t(-0.25)*((yv0+yv1+yv5+yv4) - (yv3+yv2+yv6+yv7)) ;
-      dzv = Real_t(-0.25)*((zv0+zv1+zv5+zv4) - (zv3+zv2+zv6+zv7)) ;
-
-      domain->delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQRegionForElems(Domain* domain, Int_t r,
-                                  Real_t ptiny)
-{
-   Real_t monoq_limiter_mult = domain->monoq_limiter_mult();
-   Real_t monoq_max_slope = domain->monoq_max_slope();
-   Real_t qlc_monoq = domain->qlc_monoq();
-   Real_t qqc_monoq = domain->qqc_monoq();
-
-   RAJA::forall<mat_exec_policy>(0, domain->regElemSize(r), [=] (int i) { 
-      Index_t ielem = domain->regElemlist(r,i);
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Int_t bcMask = domain->elemBC(ielem) ;
-      Real_t delvm = 0.0, delvp =0.0;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / (domain->delv_xi(ielem)+ ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case XI_M_COMM: /* needs comm data */
-         case 0:         delvm = domain->delv_xi(domain->lxim(ielem)); break ;
-         case XI_M_SYMM: delvm = domain->delv_xi(ielem) ;       break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & XI_P) {
-         case XI_P_COMM: /* needs comm data */
-         case 0:         delvp = domain->delv_xi(domain->lxip(ielem)) ; break ;
-         case XI_P_SYMM: delvp = domain->delv_xi(ielem) ;       break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( domain->delv_eta(ielem) + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case ETA_M_COMM: /* needs comm data */
-         case 0:          delvm = domain->delv_eta(domain->letam(ielem)) ; break ;
-         case ETA_M_SYMM: delvm = domain->delv_eta(ielem) ;        break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & ETA_P) {
-         case ETA_P_COMM: /* needs comm data */
-         case 0:          delvp = domain->delv_eta(domain->letap(ielem)) ; break ;
-         case ETA_P_SYMM: delvp = domain->delv_eta(ielem) ;        break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( domain->delv_zeta(ielem) + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case ZETA_M_COMM: /* needs comm data */
-         case 0:           delvm = domain->delv_zeta(domain->lzetam(ielem)) ; break ;
-         case ZETA_M_SYMM: delvm = domain->delv_zeta(ielem) ;         break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & ZETA_P) {
-         case ZETA_P_COMM: /* needs comm data */
-         case 0:           delvp = domain->delv_zeta(domain->lzetap(ielem)) ; break ;
-         case ZETA_P_SYMM: delvp = domain->delv_zeta(ielem) ;         break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( domain->vdov(ielem) > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = domain->delv_xi(ielem)   * domain->delx_xi(ielem)   ;
-         Real_t delvxeta  = domain->delv_eta(ielem)  * domain->delx_eta(ielem)  ;
-         Real_t delvxzeta = domain->delv_zeta(ielem) * domain->delx_zeta(ielem) ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = domain->elemMass(ielem) / (domain->volo(ielem) * domain->vnew(ielem)) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      domain->qq(ielem) = qquad ;
-      domain->ql(ielem) = qlin  ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQForElems(Domain* domain)
-{  
-   //
-   // initialize parameters
-   // 
-   const Real_t ptiny = Real_t(1.e-36) ;
-
-   //
-   // calculate the monotonic q for all regions
-   //
-   for (Index_t r=0 ; r<domain->numReg() ; ++r) {
-      if (domain->regElemSize(r) > 0) {
-         CalcMonotonicQRegionForElems(domain, r, ptiny) ;
-      }
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcQForElems(Domain* domain)
-{
-   //
-   // MONOTONIC Q option
-   //
-
-   Index_t numElem = domain->numElem() ;
-
-   if (numElem != 0) {
-      Int_t allElem = numElem +  /* local elem */
-            2*domain->sizeX()*domain->sizeY() + /* plane ghosts */
-            2*domain->sizeX()*domain->sizeZ() + /* row ghosts */
-            2*domain->sizeY()*domain->sizeZ() ; /* col ghosts */
-
-      domain->AllocateGradients(numElem, allElem);
-
-#if USE_MPI      
-      CommRecv(*domain, MSG_MONOQ, 3,
-               domain->sizeX(), domain->sizeY(), domain->sizeZ(),
-               true, true) ;
-#endif      
-
-      /* Calculate velocity gradients */
-      CalcMonotonicQGradientsForElems(domain);
-
-#if USE_MPI      
-      Domain_member fieldData[3] ;
-      
-      /* Transfer veloctiy gradients in the first order elements */
-      /* problem->commElements->Transfer(CommElements::monoQ) ; */
-
-      fieldData[0] = &Domain::delv_xi ;
-      fieldData[1] = &Domain::delv_eta ;
-      fieldData[2] = &Domain::delv_zeta ;
-
-      CommSend(*domain, MSG_MONOQ, 3, fieldData,
-               domain->sizeX(), domain->sizeY(), domain->sizeZ(),
-               true, true) ;
-
-      CommMonoQ(*domain) ;
-#endif      
-
-      CalcMonotonicQForElems(domain) ;
-
-      // Free up memory
-      domain->DeallocateGradients();
-
-      /* Don't allow excessive artificial viscosity */
-      Index_t idx = -1; 
-      for (Index_t i=0; i<numElem; ++i) {
-         if ( domain->q(i) > domain->qstop() ) {
-            idx = i ;
-            break ;
-         }
-      }
-
-      if(idx >= 0) {
-#if USE_MPI         
-         MPI_Abort(MPI_COMM_WORLD, QStopError) ;
-#else
-         exit(QStopError);
-#endif
-      }
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcPressureForElems(Real_t* p_new, Real_t* bvc,
-                          Real_t* pbvc, Real_t* e_old,
-                          Real_t* compression, Real_t *vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          Index_t length, Index_t *regElemList)
-{
-   RAJA::forall<range_exec_policy>(0, length, [=] (int i) { 
-      Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
-      bvc[i] = c1s * (compression[i] + Real_t(1.));
-      pbvc[i] = c1s;
-   } );
-
-   RAJA::forall<mat_exec_policy>(0, length, [=] (int i) { 
-      Index_t ielem = regElemList[i];
-      
-      p_new[i] = bvc[i] * e_old[i] ;
-
-      if    (FABS(p_new[i]) <  p_cut   )
-         p_new[i] = Real_t(0.0) ;
-
-      if    ( vnewc[ielem] >= eosvmax ) /* impossible condition here? */
-         p_new[i] = Real_t(0.0) ;
-
-      if    (p_new[i]       <  pmin)
-         p_new[i]   = pmin ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcEnergyForElems(Real_t* p_new, Real_t* e_new, Real_t* q_new,
-                        Real_t* bvc, Real_t* pbvc,
-                        Real_t* p_old, Real_t* e_old, Real_t* q_old,
-                        Real_t* compression, Real_t* compHalfStep,
-                        Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,
-                        Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
-                        Real_t* qq_old, Real_t* ql_old,
-                        Real_t rho0,
-                        Real_t eosvmax,
-                        Index_t length, Index_t *regElemList)
-{
-   Real_t *pHalfStep = Allocate<Real_t>(length) ;
-
-   
-   RAJA::forall<range_exec_policy>(0, length, [=] (int i) { 
-      e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
-         + Real_t(0.5) * work[i];
-
-      if (e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                        pmin, p_cut, eosvmax, length, regElemList);
-
-   RAJA::forall<range_exec_policy>(0, length, [=] (int i) {
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
-
-      if ( delvc[i] > Real_t(0.) ) {
-         q_new[i] /* = qq_old[i] = ql_old[i] */ = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]
-         * (  Real_t(3.0)*(p_old[i]     + q_old[i])
-              - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;
-   } );
-
-   RAJA::forall<range_exec_policy>(0, length, [=] (int i) {
-
-      e_new[i] += Real_t(0.5) * work[i];
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                        pmin, p_cut, eosvmax, length, regElemList);
-
-   RAJA::forall<mat_exec_policy>(0, length, [=] (int i) {
-      const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-      Index_t ielem = regElemList[i];
-      Real_t q_tilde ;
-
-      if (delvc[i] > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[ielem] * vnewc[ielem] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
-                               - Real_t(8.0)*(pHalfStep[i] + q_new[i])
-                               + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                        pmin, p_cut, eosvmax, length, regElemList);
-
-   RAJA::forall<mat_exec_policy>(0, length, [=] (int i) {
-      Index_t ielem = regElemList[i];
-
-      if ( delvc[i] <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[ielem] * vnewc[ielem] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-
-         if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
-      }
-   } );
-
-   Release(&pHalfStep) ;
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcSoundSpeedForElems(Domain* domain,
-                            Real_t *vnewc, Real_t rho0, Real_t *enewc,
-                            Real_t *pnewc, Real_t *pbvc,
-                            Real_t *bvc, Real_t ss4o3,
-                            Index_t len, Index_t *regElemList)
-{
-  
-   RAJA::forall<mat_exec_policy>(0, len, [=] (int i) {
-      Index_t ielem = regElemList[i];
-      Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[ielem] * vnewc[ielem] *
-                 bvc[i] * pnewc[i]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp);
-      }
-      domain->ss(ielem) = ssTmp ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void EvalEOSForElems(Domain* domain, Real_t *vnewc,
-                     Int_t numElemReg, Index_t *regElemList, Int_t rep)
-{
-   Real_t  e_cut = domain->e_cut() ;
-   Real_t  p_cut = domain->p_cut() ;
-   Real_t  ss4o3 = domain->ss4o3() ;
-   Real_t  q_cut = domain->q_cut() ;
-
-   Real_t eosvmax = domain->eosvmax() ;
-   Real_t eosvmin = domain->eosvmin() ;
-   Real_t pmin    = domain->pmin() ;
-   Real_t emin    = domain->emin() ;
-   Real_t rho0    = domain->refdens() ;
-
-   // These temporaries will be of different size for 
-   // each call (due to different sized region element
-   // lists)
-   Real_t *e_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *delvc = Allocate<Real_t>(numElemReg) ;
-   Real_t *p_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *q_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *compression = Allocate<Real_t>(numElemReg) ;
-   Real_t *compHalfStep = Allocate<Real_t>(numElemReg) ;
-   Real_t *qq_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *ql_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *work = Allocate<Real_t>(numElemReg) ;
-   Real_t *p_new = Allocate<Real_t>(numElemReg) ;
-   Real_t *e_new = Allocate<Real_t>(numElemReg) ;
-   Real_t *q_new = Allocate<Real_t>(numElemReg) ;
-   Real_t *bvc = Allocate<Real_t>(numElemReg) ;
-   Real_t *pbvc = Allocate<Real_t>(numElemReg) ;
- 
-   //loop to add load imbalance based on region number 
-   for(Int_t j = 0; j < rep; j++) {
-      /* compress data, minimal set */
-      RAJA::forall<mat_exec_policy>(0, numElemReg, [=] (int i) {
-         Index_t ielem = regElemList[i];
-         e_old[i] = domain->e(ielem) ;
-         delvc[i] = domain->delv(ielem) ;
-         p_old[i] = domain->p(ielem) ;
-         q_old[i] = domain->q(ielem) ;
-         qq_old[i] = domain->qq(ielem) ;
-         ql_old[i] = domain->ql(ielem) ;
-         work[i] = Real_t(0.) ;
-      } );
-
-      RAJA::forall<mat_exec_policy>(0, numElemReg, [=] (int i) {
-         Index_t ielem = regElemList[i];
-         Real_t vchalf ;
-         compression[i] = Real_t(1.) / vnewc[ielem] - Real_t(1.);
-         vchalf = vnewc[ielem] - delvc[i] * Real_t(.5);
-         compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);
-      } );
-
-      /* Check for v > eosvmax or v < eosvmin */
-      if ( eosvmin != Real_t(0.) ) {
-         RAJA::forall<mat_exec_policy>(0, numElemReg, [=] (int i) {
-            Index_t ielem = regElemList[i];
-            if (vnewc[ielem] <= eosvmin) { /* impossible due to calling func? */
-               compHalfStep[i] = compression[i] ;
-            }
-         } );
-      }
-
-      if ( eosvmax != Real_t(0.) ) {
-         RAJA::forall<mat_exec_policy>(0, numElemReg, [=] (int i) {
-            Index_t ielem = regElemList[i];
-            if (vnewc[ielem] >= eosvmax) { /* impossible due to calling func? */
-               p_old[i]        = Real_t(0.) ;
-               compression[i]  = Real_t(0.) ;
-               compHalfStep[i] = Real_t(0.) ;
-            }
-         } );
-      }
-
-      CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
-                         p_old, e_old,  q_old, compression, compHalfStep,
-                         vnewc, work,  delvc, pmin,
-                         p_cut, e_cut, q_cut, emin,
-                         qq_old, ql_old, rho0, eosvmax,
-                         numElemReg, regElemList);
-   }
-
-   RAJA::forall<mat_exec_policy>(0, numElemReg, [=] (int i) {
-      Index_t ielem = regElemList[i];
-      domain->p(ielem) = p_new[i] ;
-      domain->e(ielem) = e_new[i] ;
-      domain->q(ielem) = q_new[i] ;
-   } );
-
-   CalcSoundSpeedForElems(domain,
-                          vnewc, rho0, e_new, p_new,
-                          pbvc, bvc, ss4o3,
-                          numElemReg, regElemList) ;
-
-   Release(&pbvc) ;
-   Release(&bvc) ;
-   Release(&q_new) ;
-   Release(&e_new) ;
-   Release(&p_new) ;
-   Release(&work) ;
-   Release(&ql_old) ;
-   Release(&qq_old) ;
-   Release(&compHalfStep) ;
-   Release(&compression) ;
-   Release(&q_old) ;
-   Release(&p_old) ;
-   Release(&delvc) ;
-   Release(&e_old) ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void ApplyMaterialPropertiesForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-
-  if (numElem != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = domain->eosvmin() ;
-    Real_t eosvmax = domain->eosvmax() ;
-    Real_t *vnewc = Allocate<Real_t>(numElem) ;
-
-    RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-       vnewc[i] = domain->vnew(i) ;
-    } );
-
-    // Bound the updated relative volumes with eosvmin/max
-    if (eosvmin != Real_t(0.)) {
-       RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-          if (vnewc[i] < eosvmin)
-             vnewc[i] = eosvmin ;
-       } );
-    }
-
-    if (eosvmax != Real_t(0.)) {
-       RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-          if (vnewc[i] > eosvmax)
-             vnewc[i] = eosvmax ;
-       } );
-    }
-
-    // This check may not make perfect sense in LULESH, but
-    // it's representative of something in the full code -
-    // just leave it in, please
-    RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-       Real_t vc = domain->v(i) ;
-       if (eosvmin != Real_t(0.)) {
-          if (vc < eosvmin)
-             vc = eosvmin ;
-       }
-       if (eosvmax != Real_t(0.)) {
-          if (vc > eosvmax)
-             vc = eosvmax ;
-       }
-       if (vc <= 0.) {
-#if USE_MPI             
-          MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-          exit(VolumeError);
-#endif
-       }
-    } );
-
-    for (Int_t r=0 ; r<domain->numReg() ; r++) {
-       Index_t numElemReg = domain->regElemSize(r);
-       Index_t *regElemList = domain->regElemlist(r);
-       Int_t rep;
-       //Determine load imbalance for this region
-       //round down the number with lowest cost
-       if(r < domain->numReg()/2)
-	 rep = 1;
-       //you don't get an expensive region unless you at least have 5 regions
-       else if(r < (domain->numReg() - (domain->numReg()+15)/20))
-         rep = 1 + domain->cost();
-       //very expensive regions
-       else
-	 rep = 10 * (1+ domain->cost());
-       EvalEOSForElems(domain, vnewc, numElemReg, regElemList, rep);
-    }
-
-  }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void UpdateVolumesForElems(Domain* domain,
-                           Real_t v_cut, Index_t length)
-{
-   if (length != 0) {
-      RAJA::forall<range_exec_policy>( int(0), int(length), [=] (int i) {
-         Real_t tmpV = domain->vnew(i) ;
-
-         if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-            tmpV = Real_t(1.0) ;
-
-         domain->v(i) = tmpV ;
-      } );
-   }
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeElements(Domain* domain, Index_t numElem)
-{
-  CalcLagrangeElements(domain) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems(domain) ;
-
-  ApplyMaterialPropertiesForElems(domain) ;
-
-  UpdateVolumesForElems(domain,
-                        domain->v_cut(), numElem) ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcCourantConstraintForElems(Domain* domain, Index_t length,
-                                   Index_t *regElemlist,
-                                   Real_t qqc, Real_t& dtcourant)
-{
-   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
-
-   RAJA::ReduceMin<reduce_policy, Real_t> dtcourantLoc(dtcourant) ;
-
-   RAJA::forall<mat_exec_policy>(0, length, [=] (int i) {
-
-      Index_t indx = regElemlist[i] ;
-      Real_t dtf = domain->ss(indx) * domain->ss(indx) ;
-
-      if ( domain->vdov(indx) < Real_t(0.) ) {
-         dtf += qqc2 * domain->arealg(indx) * domain->arealg(indx) *
-                domain->vdov(indx) * domain->vdov(indx) ;
-      }
-
-      Real_t dtf_cmp = (domain->vdov(indx) != Real_t(0.))
-                     ?  domain->arealg(indx) / SQRT(dtf) : Real_t(1.0e+20) ;
-
-      /* determine minimum timestep with its corresponding elem */
-      dtcourantLoc.min(dtf_cmp) ;
-   } ) ;
-
-   /* Don't try to register a time constraint if none of the elements
-    * were active */
-   if (dtcourantLoc < Real_t(1.0e+20)) {
-      dtcourant = dtcourantLoc ;
-   }
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcHydroConstraintForElems(Domain* domain, Index_t length,
-                                 Index_t *regElemlist, Real_t dvovmax, Real_t& dthydro)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dthydroLoc(dthydro) ;
-
-   RAJA::forall<mat_exec_policy>(0, length, [=] (int i) {
-
-      Index_t indx = regElemlist[i] ;
-
-       Real_t dtvov_cmp = (domain->vdov(indx) != Real_t(0.))
-                        ? (dvovmax / (FABS(domain->vdov(indx))+Real_t(1.e-20)))
-                        : Real_t(1.0e+20) ;
-
-      dthydroLoc.min(dtvov_cmp) ;
-   } ) ;
-
-   if (dthydroLoc < Real_t(1.0e+20)) {
-      dthydro = dthydroLoc ;
-   }
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcTimeConstraintsForElems(Domain* domain) {
-
-   // Initialize conditions to a very large value
-   domain->dtcourant() = 1.0e+20;
-   domain->dthydro() = 1.0e+20;
-
-   for (Index_t r=0 ; r < domain->numReg() ; ++r) {
-      /* evaluate time constraint */
-      CalcCourantConstraintForElems(domain, domain->regElemSize(r),
-                                    domain->regElemlist(r),
-                                    domain->qqc(),
-                                    domain->dtcourant()) ;
-
-      /* check hydro constraint */
-      CalcHydroConstraintForElems(domain, domain->regElemSize(r),
-                                  domain->regElemlist(r),
-                                  domain->dvovmax(),
-                                  domain->dthydro()) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeLeapFrog(Domain* domain)
-{
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   Domain_member fieldData[6] ;
-#endif
-
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal(domain);
-
-
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-#endif
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements(domain, domain->numElem());
-
-#if USE_MPI   
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   CommRecv(*domain, MSG_SYNC_POS_VEL, 6,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-
-   fieldData[0] = &Domain::x ;
-   fieldData[1] = &Domain::y ;
-   fieldData[2] = &Domain::z ;
-   fieldData[3] = &Domain::xd ;
-   fieldData[4] = &Domain::yd ;
-   fieldData[5] = &Domain::zd ;
-   
-   CommSend(*domain, MSG_SYNC_POS_VEL, 6, fieldData,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-#endif
-#endif   
-
-   CalcTimeConstraintsForElems(domain);
-
-#if USE_MPI   
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   CommSyncPosVel(*domain) ;
-#endif
-#endif   
-}
-
-
-/******************************************/
-
-int main(int argc, char *argv[])
-{
-   Domain *locDom ;
-   Int_t numRanks ;
-   Int_t myRank ;
-   struct cmdLineOpts opts;
-
-#if USE_MPI   
-   Domain_member fieldData ;
-
-   MPI_Init(&argc, &argv) ;
-   MPI_Comm_size(MPI_COMM_WORLD, &numRanks) ;
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-#else
-   numRanks = 1;
-   myRank = 0;
-#endif   
-
-   /* Set defaults that can be overridden by command line opts */
-   opts.its = 9999999;
-   opts.nx  = 30;
-   opts.numReg = 11;
-   opts.numFiles = (int)(numRanks+10)/9;
-   opts.showProg = 0;
-   opts.quiet = 0;
-   opts.viz = 0;
-   opts.balance = 1;
-   opts.cost = 1;
-
-   ParseCommandLineOptions(argc, argv, myRank, &opts);
-
-   if ((myRank == 0) && (opts.quiet == 0)) {
-      printf("Running problem size %d^3 per domain until completion\n", opts.nx);
-      printf("Num processors: %d\n", numRanks);
-#if USE_OMP
-      printf("Num threads: %d\n", omp_get_max_threads());
-#endif
-      printf("Total number of elements: %lld\n\n", (long long int)(numRanks*opts.nx*opts.nx*opts.nx));
-      printf("To run other sizes, use -s <integer>.\n");
-      printf("To run a fixed number of iterations, use -i <integer>.\n");
-      printf("To run a more or less balanced region set, use -b <integer>.\n");
-      printf("To change the relative costs of regions, use -c <integer>.\n");
-      printf("To print out progress, use -p\n");
-      printf("To write an output file for VisIt, use -v\n");
-      printf("See help (-h) for more options\n\n");
-   }
-
-   // Set up the mesh and decompose. Assumes regular cubes for now
-   Int_t col, row, plane, side;
-   InitMeshDecomp(numRanks, myRank, &col, &row, &plane, &side);
-
-   // Build the main data structure and initialize it
-   locDom = new Domain(numRanks, col, row, plane, opts.nx,
-                       side, opts.numReg, opts.balance, opts.cost) ;
-
-
-#if USE_MPI   
-   fieldData = &Domain::nodalMass ;
-
-   // Initial domain boundary communication 
-   CommRecv(*locDom, MSG_COMM_SBN, 1,
-            locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() + 1,
-            true, false) ;
-   CommSend(*locDom, MSG_COMM_SBN, 1, &fieldData,
-            locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() +  1,
-            true, false) ;
-   CommSBN(*locDom, 1, &fieldData) ;
-
-   // End initialization
-   MPI_Barrier(MPI_COMM_WORLD);
-#endif   
-   
-   // BEGIN timestep to solution */
-#ifdef RAJA_USE_CALIPER
-   RAJA::Timer timer_main; 
-   timer_main.start("timer_main");
-#else
-#if USE_MPI   
-   double start = MPI_Wtime();
-#else
-   timeval start;
-   gettimeofday(&start, NULL) ;
-#endif
-#endif
-//debug to see region sizes
-//   for(Int_t i = 0; i < locDom->numReg(); i++)
-//      std::cout << "region" << i + 1<< "size" << locDom->regElemSize(i) <<std::endl;
-   while((locDom->time() < locDom->stoptime()) && (locDom->cycle() < opts.its)) {
-
-      TimeIncrement(*locDom) ;
-      LagrangeLeapFrog(locDom) ;
-
-      if ((opts.showProg != 0) && (opts.quiet == 0) && (myRank == 0)) {
-         printf("cycle = %d, time = %e, dt=%e\n",
-                locDom->cycle(), double(locDom->time()), double(locDom->deltatime()) ) ;
-      }
-   }
-
-   // Use reduced max elapsed time
-   double elapsed_time;
-#ifdef RAJA_USE_CALIPER
-   // Use reduced max elapsed time
-   timer_main.stop("timer_main");
-   elapsed_time = timer_main.elapsed();
-#else
-#if USE_MPI   
-   elapsed_time = MPI_Wtime() - start;
-#else
-   timeval end;
-   gettimeofday(&end, NULL) ;
-   elapsed_time = (double)(end.tv_sec - start.tv_sec) + ((double)(end.tv_usec - start.tv_usec))/1000000 ;
-#endif
-#endif
-   double elapsed_timeG;
-#if USE_MPI   
-   MPI_Reduce(&elapsed_time, &elapsed_timeG, 1, MPI_DOUBLE,
-              MPI_MAX, 0, MPI_COMM_WORLD);
-#else
-   elapsed_timeG = elapsed_time;
-#endif
-
-   // Write out final viz file */
-   if (opts.viz) {
-      DumpToVisit(*locDom, opts.numFiles, myRank, numRanks) ;
-   }
-   
-   if ((myRank == 0) && (opts.quiet == 0)) {
-      VerifyAndWriteFinalOutput(elapsed_timeG, *locDom, opts.nx, numRanks);
-   }
-
-   delete locDom;
-
-#if USE_MPI
-   MPI_Finalize() ;
-#endif
-
-   return 0 ;
-}
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.cc.src-KEEP_FULLCONVERT b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.cc.src-KEEP_FULLCONVERT
deleted file mode 100644
index 38c150698..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.cc.src-KEEP_FULLCONVERT
+++ /dev/null
@@ -1,2737 +0,0 @@
-/*
-  This is a Version 2.0 MPI + OpenMP implementation of LULESH
-
-                 Copyright (c) 2010-2013.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 2.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-//////////////
-DIFFERENCES BETWEEN THIS VERSION (2.x) AND EARLIER VERSIONS:
-* Addition of regions to make work more representative of multi-material codes
-* Default size of each domain is 30^3 (27000 elem) instead of 45^3. This is
-  more representative of our actual working set sizes
-* Single source distribution supports pure serial, pure OpenMP, MPI-only, 
-  and MPI+OpenMP
-* Addition of ability to visualize the mesh using VisIt 
-  https://wci.llnl.gov/codes/visit/download.html
-* Various command line options (see ./lulesh2.0 -h)
- -q              : quiet mode - suppress stdout
- -i <iterations> : number of cycles to run
- -s <size>       : length of cube mesh along side
- -r <numregions> : Number of distinct regions (def: 11)
- -b <balance>    : Load balance between regions of a domain (def: 1)
- -c <cost>       : Extra cost of more expensive regions (def: 1)
- -f <filepieces> : Number of file parts for viz output (def: np/9)
- -p              : Print out progress
- -v              : Output viz file (requires compiling with -DVIZ_MESH
- -h              : This message
-
- printf("Usage: %s [opts]\n", execname);
-      printf(" where [opts] is one or more of:\n");
-      printf(" -q              : quiet mode - suppress all stdout\n");
-      printf(" -i <iterations> : number of cycles to run\n");
-      printf(" -s <size>       : length of cube mesh along side\n");
-      printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
-      printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
-      printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
-      printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
-      printf(" -p              : Print out progress\n");
-      printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
-      printf(" -h              : This message\n");
-      printf("\n\n");
-
-*Notable changes in LULESH 2.0
-
-* Split functionality into different files
-lulesh.cc - where most (all?) of the timed functionality lies
-lulesh-comm.cc - MPI functionality
-lulesh-init.cc - Setup code
-lulesh-viz.cc  - Support for visualization option
-lulesh-util.cc - Non-timed functions
-*
-* The concept of "regions" was added, although every region is the same ideal
-*    gas material, and the same sedov blast wave problem is still the only
-*    problem its hardcoded to solve.
-* Regions allow two things important to making this proxy app more representative:
-*   Four of the LULESH routines are now performed on a region-by-region basis,
-*     making the memory access patterns non-unit stride
-*   Artificial load imbalances can be easily introduced that could impact
-*     parallelization strategies.  
-* The load balance flag changes region assignment.  Region number is raised to
-*   the power entered for assignment probability.  Most likely regions changes
-*   with MPI process id.
-* The cost flag raises the cost of ~45% of the regions to evaluate EOS by the
-*   entered multiple. The cost of 5% is 10x the entered multiple.
-* MPI and OpenMP were added, and coalesced into a single version of the source
-*   that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
-* Added support to write plot files using "poor mans parallel I/O" when linked
-*   with the silo library, which in turn can be read by VisIt.
-* Enabled variable timestep calculation by default (courant condition), which
-*   results in an additional reduction.
-* Default domain (mesh) size reduced from 45^3 to 30^3
-* Command line options to allow numerous test cases without needing to recompile
-* Performance optimizations and code cleanup beyond LULESH 1.0
-* Added a "Figure of Merit" calculation (elements solved per microsecond) and
-*   output in support of using LULESH 2.0 for the 2017 CORAL procurement
-*
-* Possible Differences in Final Release (other changes possible)
-*
-* High Level mesh structure to allow data structure transformations
-* Different default parameters
-* Minor code performance changes and cleanup
-
-TODO in future versions
-* Add reader for (truly) unstructured meshes, probably serial only
-* CMake based build system
-
-//////////////
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <climits>
-#include <vector>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <time.h>
-#include <sys/time.h>
-#include <iostream>
-#include <unistd.h>
-
-#if USE_OMP
-# include <omp.h>
-#endif
-
-#include "lulesh.h"
-
-#include "Timer.hxx"
-
-
-
-#define RAJA_STORAGE static inline
-
-//typedef RAJA::seq_exec              Segment_Exec;
-//typedef RAJA::simd_exec             Segment_Exec;
-//typedef RAJA::seq_reduce            reduce_policy;
-
-typedef RAJA::omp_parallel_for_exec Segment_Exec;
-typedef RAJA::omp_reduce            reduce_policy;
-
-//typedef RAJA::cilk_for_exec         Segment_Exec;
-//typedef RAJA::cilk_reduce            reduce_policy;
-
-typedef Segment_Exec node_exec_policy;
-typedef Segment_Exec elem_exec_policy;
-typedef Segment_Exec min_exec_policy;
-typedef Segment_Exec mat_exec_policy;
-typedef Segment_Exec range_exec_policy;
-
-
-/*********************************/
-/* Data structure implementation */
-/*********************************/
-
-/* might want to add access methods so that memory can be */
-/* better managed, as in luleshFT */
-
-template <typename T>
-T *Allocate(size_t size)
-{
-   return static_cast<T *>(malloc(sizeof(T)*size)) ;
-}
-
-template <typename T>
-void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-
-
-/******************************************/
-
-/* Work Routines */
-
-RAJA_STORAGE
-void TimeIncrement(Domain& domain)
-{
-   Real_t targetdt = domain.stoptime() - domain.time() ;
-
-   if ((domain.dtfixed() <= Real_t(0.0)) && (domain.cycle() != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = domain.deltatime() ;
-
-      /* This will require a reduction in parallel */
-      Real_t gnewdt = Real_t(1.0e+20) ;
-      Real_t newdt ;
-      if (domain.dtcourant() < gnewdt) {
-         gnewdt = domain.dtcourant() / Real_t(2.0) ;
-      }
-      if (domain.dthydro() < gnewdt) {
-         gnewdt = domain.dthydro() * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-#if USE_MPI      
-      MPI_Allreduce(&gnewdt, &newdt, 1,
-                    ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE),
-                    MPI_MIN, MPI_COMM_WORLD) ;
-#else
-      newdt = gnewdt;
-#endif
-      
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < domain.deltatimemultlb()) {
-            newdt = olddt ;
-         }
-         else if (ratio > domain.deltatimemultub()) {
-            newdt = olddt*domain.deltatimemultub() ;
-         }
-      }
-
-      if (newdt > domain.dtmax()) {
-         newdt = domain.dtmax() ;
-      }
-      domain.deltatime() = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > domain.deltatime()) &&
-       (targetdt < (Real_t(4.0) * domain.deltatime() / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * domain.deltatime() / Real_t(3.0) ;
-   }
-
-   if (targetdt < domain.deltatime()) {
-      domain.deltatime() = targetdt ;
-   }
-
-   domain.time() += domain.deltatime() ;
-
-   ++domain.cycle() ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CollectDomainNodesToElemNodes(Domain* domain,
-                                   const Index_t* elemToNode,
-                                   Real_t elemX[8],
-                                   Real_t elemY[8],
-                                   Real_t elemZ[8])
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = domain->x(nd0i);
-   elemX[1] = domain->x(nd1i);
-   elemX[2] = domain->x(nd2i);
-   elemX[3] = domain->x(nd3i);
-   elemX[4] = domain->x(nd4i);
-   elemX[5] = domain->x(nd5i);
-   elemX[6] = domain->x(nd6i);
-   elemX[7] = domain->x(nd7i);
-
-   elemY[0] = domain->y(nd0i);
-   elemY[1] = domain->y(nd1i);
-   elemY[2] = domain->y(nd2i);
-   elemY[3] = domain->y(nd3i);
-   elemY[4] = domain->y(nd4i);
-   elemY[5] = domain->y(nd5i);
-   elemY[6] = domain->y(nd6i);
-   elemY[7] = domain->y(nd7i);
-
-   elemZ[0] = domain->z(nd0i);
-   elemZ[1] = domain->z(nd1i);
-   elemZ[2] = domain->z(nd2i);
-   elemZ[3] = domain->z(nd3i);
-   elemZ[4] = domain->z(nd4i);
-   elemZ[5] = domain->z(nd5i);
-   elemZ[6] = domain->z(nd6i);
-   elemZ[7] = domain->z(nd7i);
-
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void InitStressTermsForElems(Domain* domain,
-                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                             Index_t numElem)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-
-   RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-      sigxx[i] = sigyy[i] = sigzz[i] =  - domain->p(i) - domain->q(i) ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemShapeFunctionDerivatives( Real_t const x[],
-                                       Real_t const y[],
-                                       Real_t const z[],
-                                       Real_t b[][8],
-                                       Real_t* const volume )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemNodeNormals(Real_t pfx[8],
-                         Real_t pfy[8],
-                         Real_t pfz[8],
-                         const Real_t x[8],
-                         const Real_t y[8],
-                         const Real_t z[8])
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_t* fx, Real_t* fy, Real_t* fz )
-{
-   for(Index_t i = 0; i < 8; i++) {
-      fx[i] = -( stress_xx * B[0][i] );
-      fy[i] = -( stress_yy * B[1][i]  );
-      fz[i] = -( stress_zz * B[2][i] );
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void IntegrateStressForElems( Domain* domain,
-                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                              Real_t *determ, Index_t numElem, Index_t numNode)
-{
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *fx_elem;
-   Real_t *fy_elem;
-   Real_t *fz_elem;
-   Real_t fx_local[8] ;
-   Real_t fy_local[8] ;
-   Real_t fz_local[8] ;
-
-   Real_t* tfx_local = fx_local;
-   Real_t* tfy_local = fy_local;
-   Real_t* tfz_local = fz_local;
-
-
-  if (numthreads > 1) {
-     fx_elem = Allocate<Real_t>(numElem8) ;
-     fy_elem = Allocate<Real_t>(numElem8) ;
-     fz_elem = Allocate<Real_t>(numElem8) ;
-  }
-  // loop over all elements
-
-  RAJA::forall<elem_exec_policy>(0, numElem, [=] (int k) {
-    const Index_t* const elemToNode = domain->nodelist(k);
-    Real_t B[3][8] ;// shape function derivatives
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
-
-    // Volume calculation involves extra work for numerical consistency
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                         B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                          x_local, y_local, z_local );
-
-    if (numthreads > 1) {
-       // Eliminate thread writing conflicts at the nodes by giving
-       // each element its own copy to write to
-       SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                    &fx_elem[k*8],
-                                    &fy_elem[k*8],
-                                    &fz_elem[k*8] ) ;
-    }
-    else {
-       SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                    tfx_local, tfy_local, tfz_local ) ;
-
-       // copy nodal force contributions to global force arrray.
-       for( Index_t lnode=0 ; lnode<8 ; ++lnode ) {
-          Index_t gnode = elemToNode[lnode];
-          domain->fx(gnode) += tfx_local[lnode];
-          domain->fy(gnode) += tfy_local[lnode];
-          domain->fz(gnode) += tfz_local[lnode];
-       }
-    }
-  } );
-
-  if (numthreads > 1) {
-     // If threaded, then we need to copy the data out of the temporary
-     // arrays used above into the final forces field
-     RAJA::forall<node_exec_policy>(0, numNode, [=] (int gnode) {
-        Index_t count = domain->nodeElemCount(gnode) ;
-        Index_t *cornerList = domain->nodeElemCornerList(gnode) ;
-        Real_t fx_tmp = Real_t(0.0) ;
-        Real_t fy_tmp = Real_t(0.0) ;
-        Real_t fz_tmp = Real_t(0.0) ;
-        for (Index_t i=0 ; i < count ; ++i) {
-           Index_t ielem = cornerList[i] ;
-           fx_tmp += fx_elem[ielem] ;
-           fy_tmp += fy_elem[ielem] ;
-           fz_tmp += fz_elem[ielem] ;
-        }
-        domain->fx(gnode) = fx_tmp ;
-        domain->fy(gnode) = fy_tmp ;
-        domain->fz(gnode) = fz_tmp ;
-     } );
-     Release(&fz_elem) ;
-     Release(&fy_elem) ;
-     Release(&fx_elem) ;
-  }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemVolumeDerivative(Real_t dvdx[8],
-                              Real_t dvdy[8],
-                              Real_t dvdz[8],
-                              const Real_t x[8],
-                              const Real_t y[8],
-                              const Real_t z[8])
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t hourgam[][4],
-                              Real_t coefficient,
-                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
-{
-   Real_t hxx[4];
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * xd[0] + hourgam[1][i] * xd[1] +
-               hourgam[2][i] * xd[2] + hourgam[3][i] * xd[3] +
-               hourgam[4][i] * xd[4] + hourgam[5][i] * xd[5] +
-               hourgam[6][i] * xd[6] + hourgam[7][i] * xd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfx[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * yd[0] + hourgam[1][i] * yd[1] +
-               hourgam[2][i] * yd[2] + hourgam[3][i] * yd[3] +
-               hourgam[4][i] * yd[4] + hourgam[5][i] * yd[5] +
-               hourgam[6][i] * yd[6] + hourgam[7][i] * yd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfy[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * zd[0] + hourgam[1][i] * zd[1] +
-               hourgam[2][i] * zd[2] + hourgam[3][i] * zd[3] +
-               hourgam[4][i] * zd[4] + hourgam[5][i] * zd[5] +
-               hourgam[6][i] * zd[6] + hourgam[7][i] * zd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfz[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcFBHourglassForceForElems( Domain* domain,
-                                   Real_t *determ,
-                                   Real_t *x8n, Real_t *y8n, Real_t *z8n,
-                                   Real_t *dvdx, Real_t *dvdy, Real_t *dvdz,
-                                   Real_t hourg, Index_t numElem,
-                                   Index_t numNode)
-{
-
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-   /*************************************************
-    *
-    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-    *               force.
-    *
-    *************************************************/
-  
-   Index_t numElem8 = numElem * 8 ;
-
-   Real_t *fx_elem; 
-   Real_t *fy_elem; 
-   Real_t *fz_elem; 
-
-   if(numthreads > 1) {
-      fx_elem = Allocate<Real_t>(numElem8) ;
-      fy_elem = Allocate<Real_t>(numElem8) ;
-      fz_elem = Allocate<Real_t>(numElem8) ;
-   }
-
-   Real_t  gamma[4][8];
-
-   gamma[0][0] = Real_t( 1.);
-   gamma[0][1] = Real_t( 1.);
-   gamma[0][2] = Real_t(-1.);
-   gamma[0][3] = Real_t(-1.);
-   gamma[0][4] = Real_t(-1.);
-   gamma[0][5] = Real_t(-1.);
-   gamma[0][6] = Real_t( 1.);
-   gamma[0][7] = Real_t( 1.);
-   gamma[1][0] = Real_t( 1.);
-   gamma[1][1] = Real_t(-1.);
-   gamma[1][2] = Real_t(-1.);
-   gamma[1][3] = Real_t( 1.);
-   gamma[1][4] = Real_t(-1.);
-   gamma[1][5] = Real_t( 1.);
-   gamma[1][6] = Real_t( 1.);
-   gamma[1][7] = Real_t(-1.);
-   gamma[2][0] = Real_t( 1.);
-   gamma[2][1] = Real_t(-1.);
-   gamma[2][2] = Real_t( 1.);
-   gamma[2][3] = Real_t(-1.);
-   gamma[2][4] = Real_t( 1.);
-   gamma[2][5] = Real_t(-1.);
-   gamma[2][6] = Real_t( 1.);
-   gamma[2][7] = Real_t(-1.);
-   gamma[3][0] = Real_t(-1.);
-   gamma[3][1] = Real_t( 1.);
-   gamma[3][2] = Real_t(-1.);
-   gamma[3][3] = Real_t( 1.);
-   gamma[3][4] = Real_t( 1.);
-   gamma[3][5] = Real_t(-1.);
-   gamma[3][6] = Real_t( 1.);
-   gamma[3][7] = Real_t(-1.);
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-
-   RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i2) {
-      Real_t *fx_local, *fy_local, *fz_local ;
-      Real_t hgfx[8], hgfy[8], hgfz[8] ;
-
-      Real_t coefficient;
-
-      Real_t hourgam[8][4];
-      Real_t xd1[8], yd1[8], zd1[8] ;
-
-      const Index_t *elemToNode = domain->nodelist(i2);
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
-            x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
-            x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
-            x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
-            y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
-            y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
-            y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
-            z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
-            z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
-            z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];
-
-         hourgam[0][i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam[1][i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam[2][i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam[3][i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam[4][i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam[5][i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam[6][i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam[7][i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=domain->ss(i2);
-      mass1=domain->elemMass(i2);
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = domain->xd(n0si2);
-      xd1[1] = domain->xd(n1si2);
-      xd1[2] = domain->xd(n2si2);
-      xd1[3] = domain->xd(n3si2);
-      xd1[4] = domain->xd(n4si2);
-      xd1[5] = domain->xd(n5si2);
-      xd1[6] = domain->xd(n6si2);
-      xd1[7] = domain->xd(n7si2);
-
-      yd1[0] = domain->yd(n0si2);
-      yd1[1] = domain->yd(n1si2);
-      yd1[2] = domain->yd(n2si2);
-      yd1[3] = domain->yd(n3si2);
-      yd1[4] = domain->yd(n4si2);
-      yd1[5] = domain->yd(n5si2);
-      yd1[6] = domain->yd(n6si2);
-      yd1[7] = domain->yd(n7si2);
-
-      zd1[0] = domain->zd(n0si2);
-      zd1[1] = domain->zd(n1si2);
-      zd1[2] = domain->zd(n2si2);
-      zd1[3] = domain->zd(n3si2);
-      zd1[4] = domain->zd(n4si2);
-      zd1[5] = domain->zd(n5si2);
-      zd1[6] = domain->zd(n6si2);
-      zd1[7] = domain->zd(n7si2);
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1,
-                      hourgam,
-                      coefficient, hgfx, hgfy, hgfz);
-
-      // With the threaded version, we write into local arrays per elem
-      // so we don't have to worry about race conditions
-      if (numthreads > 1) {
-         fx_local = &fx_elem[i3] ;
-         fx_local[0] = hgfx[0];
-         fx_local[1] = hgfx[1];
-         fx_local[2] = hgfx[2];
-         fx_local[3] = hgfx[3];
-         fx_local[4] = hgfx[4];
-         fx_local[5] = hgfx[5];
-         fx_local[6] = hgfx[6];
-         fx_local[7] = hgfx[7];
-
-         fy_local = &fy_elem[i3] ;
-         fy_local[0] = hgfy[0];
-         fy_local[1] = hgfy[1];
-         fy_local[2] = hgfy[2];
-         fy_local[3] = hgfy[3];
-         fy_local[4] = hgfy[4];
-         fy_local[5] = hgfy[5];
-         fy_local[6] = hgfy[6];
-         fy_local[7] = hgfy[7];
-
-         fz_local = &fz_elem[i3] ;
-         fz_local[0] = hgfz[0];
-         fz_local[1] = hgfz[1];
-         fz_local[2] = hgfz[2];
-         fz_local[3] = hgfz[3];
-         fz_local[4] = hgfz[4];
-         fz_local[5] = hgfz[5];
-         fz_local[6] = hgfz[6];
-         fz_local[7] = hgfz[7];
-      }
-      else {
-         domain->fx(n0si2) += hgfx[0];
-         domain->fy(n0si2) += hgfy[0];
-         domain->fz(n0si2) += hgfz[0];
-
-         domain->fx(n1si2) += hgfx[1];
-         domain->fy(n1si2) += hgfy[1];
-         domain->fz(n1si2) += hgfz[1];
-
-         domain->fx(n2si2) += hgfx[2];
-         domain->fy(n2si2) += hgfy[2];
-         domain->fz(n2si2) += hgfz[2];
-
-         domain->fx(n3si2) += hgfx[3];
-         domain->fy(n3si2) += hgfy[3];
-         domain->fz(n3si2) += hgfz[3];
-
-         domain->fx(n4si2) += hgfx[4];
-         domain->fy(n4si2) += hgfy[4];
-         domain->fz(n4si2) += hgfz[4];
-
-         domain->fx(n5si2) += hgfx[5];
-         domain->fy(n5si2) += hgfy[5];
-         domain->fz(n5si2) += hgfz[5];
-
-         domain->fx(n6si2) += hgfx[6];
-         domain->fy(n6si2) += hgfy[6];
-         domain->fz(n6si2) += hgfz[6];
-
-         domain->fx(n7si2) += hgfx[7];
-         domain->fy(n7si2) += hgfy[7];
-         domain->fz(n7si2) += hgfz[7];
-      }
-   } );
-
-   if (numthreads > 1) {
-     // Collect the data from the local arrays into the final force arrays
-      RAJA::forall<node_exec_policy>(0, numNode, [=] (int gnode) {
-         Index_t count = domain->nodeElemCount(gnode) ;
-         Index_t *cornerList = domain->nodeElemCornerList(gnode) ;
-         Real_t fx_tmp = Real_t(0.0) ;
-         Real_t fy_tmp = Real_t(0.0) ;
-         Real_t fz_tmp = Real_t(0.0) ;
-         for (Index_t i=0 ; i < count ; ++i) {
-            Index_t ielem = cornerList[i] ;
-            fx_tmp += fx_elem[ielem] ;
-            fy_tmp += fy_elem[ielem] ;
-            fz_tmp += fz_elem[ielem] ;
-         }
-         domain->fx(gnode) += fx_tmp ;
-         domain->fy(gnode) += fy_tmp ;
-         domain->fz(gnode) += fz_tmp ;
-      } );
-      Release(&fz_elem) ;
-      Release(&fy_elem) ;
-      Release(&fx_elem) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcHourglassControlForElems(Domain* domain,
-                                  Real_t determ[], Real_t hgcoef)
-{
-   Index_t numElem = domain->numElem() ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *dvdx = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdy = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdz = Allocate<Real_t>(numElem8) ;
-   Real_t *x8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *y8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *z8n  = Allocate<Real_t>(numElem8) ;
-
-   // For negative element volume check
-   RAJA::ReduceMin<reduce_policy, Real_t> minvol(1.0);
-
-   /* start loop over elements */
-   RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-      Real_t  x1[8],  y1[8],  z1[8] ;
-      Real_t pfx[8], pfy[8], pfz[8] ;
-
-      Index_t* elemToNode = domain->nodelist(i);
-      CollectDomainNodesToElemNodes(domain, elemToNode, x1, y1, z1);
-
-      CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
-
-      /* load into temporary storage for FB Hour Glass control */
-      for(Index_t ii=0;ii<8;++ii) {
-         Index_t jj=8*i+ii;
-
-         dvdx[jj] = pfx[ii];
-         dvdy[jj] = pfy[ii];
-         dvdz[jj] = pfz[ii];
-         x8n[jj]  = x1[ii];
-         y8n[jj]  = y1[ii];
-         z8n[jj]  = z1[ii];
-      }
-
-      determ[i] = domain->volo(i) * domain->v(i);
-
-      minvol.min(domain->v(i));
-
-    }
-   ) ;
-
-   if ( Real_t(minvol) <= Real_t(0.0) ) {
-#if USE_MPI         
-      MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-      exit(VolumeError);
-#endif
-   }
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems( domain,
-                                    determ, x8n, y8n, z8n, dvdx, dvdy, dvdz,
-                                    hgcoef, numElem, domain->numNode()) ;
-   }
-
-   Release(&z8n) ;
-   Release(&y8n) ;
-   Release(&x8n) ;
-   Release(&dvdz) ;
-   Release(&dvdy) ;
-   Release(&dvdx) ;
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcVolumeForceForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-   if (numElem != 0) {
-      Real_t  hgcoef = domain->hgcoef() ;
-      Real_t *sigxx  = Allocate<Real_t>(numElem) ;
-      Real_t *sigyy  = Allocate<Real_t>(numElem) ;
-      Real_t *sigzz  = Allocate<Real_t>(numElem) ;
-      Real_t *determ = Allocate<Real_t>(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(domain, sigxx, sigyy, sigzz, numElem);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( domain,
-                               sigxx, sigyy, sigzz, determ, numElem,
-                               domain->numNode()) ;
-
-      // check for negative element volume
-      RAJA::ReduceMin<reduce_policy, Real_t> minvol(1.0);
-      RAJA::forall<elem_exec_policy>(0, numElem, [=] (int k) {
-         minvol.min(determ[k]);
-       }
-      ) ;
-
-      if ( Real_t(minvol) <= Real_t(0.0)) {
-#if USE_MPI            
-         MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-         exit(VolumeError);
-#endif
-      }
-
-      CalcHourglassControlForElems(domain, determ, hgcoef) ;
-
-      Release(&determ) ;
-      Release(&sigzz) ;
-      Release(&sigyy) ;
-      Release(&sigxx) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE void CalcForceForNodes(Domain* domain)
-{
-  Index_t numNode = domain->numNode() ;
-
-#if USE_MPI  
-  CommRecv(*domain, MSG_COMM_SBN, 3,
-           domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-           true, false) ;
-#endif  
-
-  RAJA::forall<node_exec_policy>(0, numNode, [=] (int i) {
-     domain->fx(i) = Real_t(0.0) ;
-     domain->fy(i) = Real_t(0.0) ;
-     domain->fz(i) = Real_t(0.0) ;
-  } );
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems(domain) ;
-
-#if USE_MPI  
-  Domain_member fieldData[3] ;
-  fieldData[0] = &Domain::fx ;
-  fieldData[1] = &Domain::fy ;
-  fieldData[2] = &Domain::fz ;
-  
-  CommSend(*domain, MSG_COMM_SBN, 3, fieldData,
-           domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() +  1,
-           true, false) ;
-  CommSBN(*domain, 3, fieldData) ;
-#endif  
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcAccelerationForNodes(Domain* domain, Index_t numNode)
-{
-   
-   RAJA::forall<node_exec_policy>(0, numNode, [=] (int i) {
-      domain->xdd(i) = domain->fx(i) / domain->nodalMass(i);
-      domain->ydd(i) = domain->fy(i) / domain->nodalMass(i);
-      domain->zdd(i) = domain->fz(i) / domain->nodalMass(i);
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void ApplyAccelerationBoundaryConditionsForNodes(Domain* domain)
-{
-   Index_t size = domain->sizeX();
-   Index_t numNodeBC = (size+1)*(size+1) ;
-
-   if (!domain->symmXempty() != 0) {
-      RAJA::forall<range_exec_policy>(int(0), int(numNodeBC), [=] (int i) {
-         domain->xdd(domain->symmX(i)) = Real_t(0.0) ;
-      } );
-   }
-
-   if (!domain->symmYempty() != 0) {
-      RAJA::forall<range_exec_policy>(int(0), int(numNodeBC), [=] (int i) {
-         domain->ydd(domain->symmY(i)) = Real_t(0.0) ;
-      } );
-   }
-
-   if (!domain->symmZempty() != 0) {
-      RAJA::forall<range_exec_policy>(int(0), int(numNodeBC), [=] (int i) {
-         domain->zdd(domain->symmZ(i)) = Real_t(0.0) ;
-      } );
-   }
-
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcVelocityForNodes(Domain* domain, const Real_t dt, const Real_t u_cut,
-                          Index_t numNode)
-{
-
-   RAJA::forall<node_exec_policy>(0, numNode, [=] (int i) {
-     Real_t xdtmp, ydtmp, zdtmp ;
-
-     xdtmp = domain->xd(i) + domain->xdd(i) * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     domain->xd(i) = xdtmp ;
-
-     ydtmp = domain->yd(i) + domain->ydd(i) * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     domain->yd(i) = ydtmp ;
-
-     zdtmp = domain->zd(i) + domain->zdd(i) * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     domain->zd(i) = zdtmp ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcPositionForNodes(Domain* domain, const Real_t dt, Index_t numNode)
-{
-   RAJA::forall<node_exec_policy>(0, numNode, [=] (int i) {
-     domain->x(i) += domain->xd(i) * dt ;
-     domain->y(i) += domain->yd(i) * dt ;
-     domain->z(i) += domain->zd(i) * dt ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeNodal(Domain* domain)
-{
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-   Domain_member fieldData[6] ;
-#endif
-
-   const Real_t delt = domain->deltatime() ;
-   Real_t u_cut = domain->u_cut() ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes(domain);
-
-#if USE_MPI  
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-   CommRecv(*domain, MSG_SYNC_POS_VEL, 6,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-#endif
-#endif
-   
-   CalcAccelerationForNodes(domain, domain->numNode());
-   
-   ApplyAccelerationBoundaryConditionsForNodes(domain);
-
-   CalcVelocityForNodes( domain, delt, u_cut, domain->numNode()) ;
-
-   CalcPositionForNodes( domain, delt, domain->numNode() );
-#if USE_MPI
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-  fieldData[0] = &Domain::x ;
-  fieldData[1] = &Domain::y ;
-  fieldData[2] = &Domain::z ;
-  fieldData[3] = &Domain::xd ;
-  fieldData[4] = &Domain::yd ;
-  fieldData[5] = &Domain::zd ;
-
-   CommSend(*domain, MSG_SYNC_POS_VEL, 6, fieldData,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-   CommSyncPosVel(*domain) ;
-#endif
-#endif
-   
-  return;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-/******************************************/
-
-//inline
-Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-/******************************************/
-
-RAJA_STORAGE
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcElemVelocityGradient( const Real_t* const xvel,
-                                const Real_t* const yvel,
-                                const Real_t* const zvel,
-                                const Real_t b[][8],
-                                const Real_t detJ,
-                                Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-/******************************************/
-
-//RAJA_STORAGE
-void CalcKinematicsForElems( Domain* domain,
-                             Real_t deltaTime, Index_t numElem )
-{
-
-  // loop over all elements
-  RAJA::forall<elem_exec_policy>(0, numElem, [=] (int k) { 
-    Real_t B[3][8] ; /** shape function derivatives */
-    Real_t D[6] ;
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-    Real_t xd_local[8] ;
-    Real_t yd_local[8] ;
-    Real_t zd_local[8] ;
-    Real_t detJ = Real_t(0.0) ;
-
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_t* const elemToNode = domain->nodelist(k) ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / domain->volo(k) ;
-    domain->vnew(k) = relativeVolume ;
-    domain->delv(k) = relativeVolume - domain->v(k) ;
-
-    // set characteristic length
-    domain->arealg(k) = CalcElemCharacteristicLength(x_local, y_local, z_local,
-                                             volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = domain->xd(gnode);
-      yd_local[lnode] = domain->yd(gnode);
-      zd_local[lnode] = domain->zd(gnode);
-    }
-
-    Real_t dt2 = Real_t(0.5) * deltaTime;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local, y_local, z_local,
-                                      B, &detJ );
-
-    CalcElemVelocityGradient( xd_local, yd_local, zd_local,
-                               B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    domain->dxx(k) = D[0];
-    domain->dyy(k) = D[1];
-    domain->dzz(k) = D[2];
-  } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcLagrangeElements(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-   if (numElem > 0) {
-      const Real_t deltatime = domain->deltatime() ;
-
-      domain->AllocateStrains(numElem);
-
-      CalcKinematicsForElems(domain, deltatime, numElem) ;
-
-      // check for negative element volume
-      RAJA::ReduceMin<reduce_policy, Real_t> minvol(1.0);
-
-      // element loop to do some stuff not included in the elemlib function.
-      RAJA::forall<elem_exec_policy>(0, numElem, [=] (int k) {
-         // calc strain rate and apply as constraint (only done in FB element)
-         Real_t vdov = domain->dxx(k) + domain->dyy(k) + domain->dzz(k) ;
-         Real_t vdovthird = vdov/Real_t(3.0) ;
-
-         // make the rate of deformation tensor deviatoric
-         domain->vdov(k) = vdov ;
-         domain->dxx(k) -= vdovthird ;
-         domain->dyy(k) -= vdovthird ;
-         domain->dzz(k) -= vdovthird ;
-
-        minvol.min(domain->vnew(k));
-       }
-      ) ;
-
-      if ( Real_t(minvol) <= Real_t(0.0)) {
-#if USE_MPI           
-           MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-           exit(VolumeError);
-#endif
-      }
-
-      domain->DeallocateStrains();
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQGradientsForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem();
-
-   RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-      const Real_t ptiny = Real_t(1.e-36) ;
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      const Index_t *elemToNode = domain->nodelist(i);
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = domain->x(n0) ;
-      Real_t x1 = domain->x(n1) ;
-      Real_t x2 = domain->x(n2) ;
-      Real_t x3 = domain->x(n3) ;
-      Real_t x4 = domain->x(n4) ;
-      Real_t x5 = domain->x(n5) ;
-      Real_t x6 = domain->x(n6) ;
-      Real_t x7 = domain->x(n7) ;
-
-      Real_t y0 = domain->y(n0) ;
-      Real_t y1 = domain->y(n1) ;
-      Real_t y2 = domain->y(n2) ;
-      Real_t y3 = domain->y(n3) ;
-      Real_t y4 = domain->y(n4) ;
-      Real_t y5 = domain->y(n5) ;
-      Real_t y6 = domain->y(n6) ;
-      Real_t y7 = domain->y(n7) ;
-
-      Real_t z0 = domain->z(n0) ;
-      Real_t z1 = domain->z(n1) ;
-      Real_t z2 = domain->z(n2) ;
-      Real_t z3 = domain->z(n3) ;
-      Real_t z4 = domain->z(n4) ;
-      Real_t z5 = domain->z(n5) ;
-      Real_t z6 = domain->z(n6) ;
-      Real_t z7 = domain->z(n7) ;
-
-      Real_t xv0 = domain->xd(n0) ;
-      Real_t xv1 = domain->xd(n1) ;
-      Real_t xv2 = domain->xd(n2) ;
-      Real_t xv3 = domain->xd(n3) ;
-      Real_t xv4 = domain->xd(n4) ;
-      Real_t xv5 = domain->xd(n5) ;
-      Real_t xv6 = domain->xd(n6) ;
-      Real_t xv7 = domain->xd(n7) ;
-
-      Real_t yv0 = domain->yd(n0) ;
-      Real_t yv1 = domain->yd(n1) ;
-      Real_t yv2 = domain->yd(n2) ;
-      Real_t yv3 = domain->yd(n3) ;
-      Real_t yv4 = domain->yd(n4) ;
-      Real_t yv5 = domain->yd(n5) ;
-      Real_t yv6 = domain->yd(n6) ;
-      Real_t yv7 = domain->yd(n7) ;
-
-      Real_t zv0 = domain->zd(n0) ;
-      Real_t zv1 = domain->zd(n1) ;
-      Real_t zv2 = domain->zd(n2) ;
-      Real_t zv3 = domain->zd(n3) ;
-      Real_t zv4 = domain->zd(n4) ;
-      Real_t zv5 = domain->zd(n5) ;
-      Real_t zv6 = domain->zd(n6) ;
-      Real_t zv7 = domain->zd(n7) ;
-
-      Real_t vol = domain->volo(i)*domain->vnew(i) ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*((x0+x1+x5+x4) - (x3+x2+x6+x7)) ;
-      Real_t dyj = Real_t(-0.25)*((y0+y1+y5+y4) - (y3+y2+y6+y7)) ;
-      Real_t dzj = Real_t(-0.25)*((z0+z1+z5+z4) - (z3+z2+z6+z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*((x1+x2+x6+x5) - (x0+x3+x7+x4)) ;
-      Real_t dyi = Real_t( 0.25)*((y1+y2+y6+y5) - (y0+y3+y7+y4)) ;
-      Real_t dzi = Real_t( 0.25)*((z1+z2+z6+z5) - (z0+z3+z7+z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*((x4+x5+x6+x7) - (x0+x1+x2+x3)) ;
-      Real_t dyk = Real_t( 0.25)*((y4+y5+y6+y7) - (y0+y1+y2+y3)) ;
-      Real_t dzk = Real_t( 0.25)*((z4+z5+z6+z7) - (z0+z1+z2+z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      domain->delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*((xv4+xv5+xv6+xv7) - (xv0+xv1+xv2+xv3)) ;
-      dyv = Real_t(0.25)*((yv4+yv5+yv6+yv7) - (yv0+yv1+yv2+yv3)) ;
-      dzv = Real_t(0.25)*((zv4+zv5+zv6+zv7) - (zv0+zv1+zv2+zv3)) ;
-
-      domain->delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      domain->delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*((xv1+xv2+xv6+xv5) - (xv0+xv3+xv7+xv4)) ;
-      dyv = Real_t(0.25)*((yv1+yv2+yv6+yv5) - (yv0+yv3+yv7+yv4)) ;
-      dzv = Real_t(0.25)*((zv1+zv2+zv6+zv5) - (zv0+zv3+zv7+zv4)) ;
-
-      domain->delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      domain->delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*((xv0+xv1+xv5+xv4) - (xv3+xv2+xv6+xv7)) ;
-      dyv = Real_t(-0.25)*((yv0+yv1+yv5+yv4) - (yv3+yv2+yv6+yv7)) ;
-      dzv = Real_t(-0.25)*((zv0+zv1+zv5+zv4) - (zv3+zv2+zv6+zv7)) ;
-
-      domain->delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQRegionForElems(Domain* domain, Int_t r,
-                                  Real_t ptiny)
-{
-   Real_t monoq_limiter_mult = domain->monoq_limiter_mult();
-   Real_t monoq_max_slope = domain->monoq_max_slope();
-   Real_t qlc_monoq = domain->qlc_monoq();
-   Real_t qqc_monoq = domain->qqc_monoq();
-
-   RAJA::forall<mat_exec_policy>(0, domain->regElemSize(r), [=] (int i) { 
-      Index_t ielem = domain->regElemlist(r,i);
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Int_t bcMask = domain->elemBC(ielem) ;
-      Real_t delvm = 0.0, delvp =0.0;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / (domain->delv_xi(ielem)+ ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case XI_M_COMM: /* needs comm data */
-         case 0:         delvm = domain->delv_xi(domain->lxim(ielem)); break ;
-         case XI_M_SYMM: delvm = domain->delv_xi(ielem) ;       break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & XI_P) {
-         case XI_P_COMM: /* needs comm data */
-         case 0:         delvp = domain->delv_xi(domain->lxip(ielem)) ; break ;
-         case XI_P_SYMM: delvp = domain->delv_xi(ielem) ;       break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( domain->delv_eta(ielem) + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case ETA_M_COMM: /* needs comm data */
-         case 0:          delvm = domain->delv_eta(domain->letam(ielem)) ; break ;
-         case ETA_M_SYMM: delvm = domain->delv_eta(ielem) ;        break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & ETA_P) {
-         case ETA_P_COMM: /* needs comm data */
-         case 0:          delvp = domain->delv_eta(domain->letap(ielem)) ; break ;
-         case ETA_P_SYMM: delvp = domain->delv_eta(ielem) ;        break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( domain->delv_zeta(ielem) + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case ZETA_M_COMM: /* needs comm data */
-         case 0:           delvm = domain->delv_zeta(domain->lzetam(ielem)) ; break ;
-         case ZETA_M_SYMM: delvm = domain->delv_zeta(ielem) ;         break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & ZETA_P) {
-         case ZETA_P_COMM: /* needs comm data */
-         case 0:           delvp = domain->delv_zeta(domain->lzetap(ielem)) ; break ;
-         case ZETA_P_SYMM: delvp = domain->delv_zeta(ielem) ;         break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( domain->vdov(ielem) > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = domain->delv_xi(ielem)   * domain->delx_xi(ielem)   ;
-         Real_t delvxeta  = domain->delv_eta(ielem)  * domain->delx_eta(ielem)  ;
-         Real_t delvxzeta = domain->delv_zeta(ielem) * domain->delx_zeta(ielem) ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = domain->elemMass(ielem) / (domain->volo(ielem) * domain->vnew(ielem)) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      domain->qq(ielem) = qquad ;
-      domain->ql(ielem) = qlin  ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcMonotonicQForElems(Domain* domain)
-{  
-   //
-   // initialize parameters
-   // 
-   const Real_t ptiny = Real_t(1.e-36) ;
-
-   //
-   // calculate the monotonic q for all regions
-   //
-   for (Index_t r=0 ; r<domain->numReg() ; ++r) {
-      if (domain->regElemSize(r) > 0) {
-         CalcMonotonicQRegionForElems(domain, r, ptiny) ;
-      }
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcQForElems(Domain* domain)
-{
-   //
-   // MONOTONIC Q option
-   //
-
-   Index_t numElem = domain->numElem() ;
-
-   if (numElem != 0) {
-      Int_t allElem = numElem +  /* local elem */
-            2*domain->sizeX()*domain->sizeY() + /* plane ghosts */
-            2*domain->sizeX()*domain->sizeZ() + /* row ghosts */
-            2*domain->sizeY()*domain->sizeZ() ; /* col ghosts */
-
-      domain->AllocateGradients(numElem, allElem);
-
-#if USE_MPI      
-      CommRecv(*domain, MSG_MONOQ, 3,
-               domain->sizeX(), domain->sizeY(), domain->sizeZ(),
-               true, true) ;
-#endif      
-
-      /* Calculate velocity gradients */
-      CalcMonotonicQGradientsForElems(domain);
-
-#if USE_MPI      
-      Domain_member fieldData[3] ;
-      
-      /* Transfer veloctiy gradients in the first order elements */
-      /* problem->commElements->Transfer(CommElements::monoQ) ; */
-
-      fieldData[0] = &Domain::delv_xi ;
-      fieldData[1] = &Domain::delv_eta ;
-      fieldData[2] = &Domain::delv_zeta ;
-
-      CommSend(*domain, MSG_MONOQ, 3, fieldData,
-               domain->sizeX(), domain->sizeY(), domain->sizeZ(),
-               true, true) ;
-
-      CommMonoQ(*domain) ;
-#endif      
-
-      CalcMonotonicQForElems(domain) ;
-
-      // Free up memory
-      domain->DeallocateGradients();
-
-      /* Don't allow excessive artificial viscosity */
-      Index_t idx = -1; 
-      for (Index_t i=0; i<numElem; ++i) {
-         if ( domain->q(i) > domain->qstop() ) {
-            idx = i ;
-            break ;
-         }
-      }
-
-      if(idx >= 0) {
-#if USE_MPI         
-         MPI_Abort(MPI_COMM_WORLD, QStopError) ;
-#else
-         exit(QStopError);
-#endif
-      }
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcPressureForElems(Real_t* p_new, Real_t* bvc,
-                          Real_t* pbvc, Real_t* e_old,
-                          Real_t* compression, Real_t *vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          Index_t length, Index_t *regElemList)
-{
-   RAJA::forall<range_exec_policy>(0, length, [=] (int i) { 
-      Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
-      bvc[i] = c1s * (compression[i] + Real_t(1.));
-      pbvc[i] = c1s;
-   } );
-
-   RAJA::forall<mat_exec_policy>(0, length, [=] (int i) { 
-      Index_t ielem = regElemList[i];
-      
-      p_new[i] = bvc[i] * e_old[i] ;
-
-      if    (FABS(p_new[i]) <  p_cut   )
-         p_new[i] = Real_t(0.0) ;
-
-      if    ( vnewc[ielem] >= eosvmax ) /* impossible condition here? */
-         p_new[i] = Real_t(0.0) ;
-
-      if    (p_new[i]       <  pmin)
-         p_new[i]   = pmin ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcEnergyForElems(Real_t* p_new, Real_t* e_new, Real_t* q_new,
-                        Real_t* bvc, Real_t* pbvc,
-                        Real_t* p_old, Real_t* e_old, Real_t* q_old,
-                        Real_t* compression, Real_t* compHalfStep,
-                        Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,
-                        Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
-                        Real_t* qq_old, Real_t* ql_old,
-                        Real_t rho0,
-                        Real_t eosvmax,
-                        Index_t length, Index_t *regElemList)
-{
-   Real_t *pHalfStep = Allocate<Real_t>(length) ;
-
-   
-   RAJA::forall<range_exec_policy>(0, length, [=] (int i) { 
-      e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
-         + Real_t(0.5) * work[i];
-
-      if (e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                        pmin, p_cut, eosvmax, length, regElemList);
-
-   RAJA::forall<range_exec_policy>(0, length, [=] (int i) {
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
-
-      if ( delvc[i] > Real_t(0.) ) {
-         q_new[i] /* = qq_old[i] = ql_old[i] */ = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]
-         * (  Real_t(3.0)*(p_old[i]     + q_old[i])
-              - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;
-   } );
-
-   RAJA::forall<range_exec_policy>(0, length, [=] (int i) {
-
-      e_new[i] += Real_t(0.5) * work[i];
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                        pmin, p_cut, eosvmax, length, regElemList);
-
-   RAJA::forall<mat_exec_policy>(0, length, [=] (int i) {
-      const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-      Index_t ielem = regElemList[i];
-      Real_t q_tilde ;
-
-      if (delvc[i] > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[ielem] * vnewc[ielem] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
-                               - Real_t(8.0)*(pHalfStep[i] + q_new[i])
-                               + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   } );
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                        pmin, p_cut, eosvmax, length, regElemList);
-
-   RAJA::forall<mat_exec_policy>(0, length, [=] (int i) {
-      Index_t ielem = regElemList[i];
-
-      if ( delvc[i] <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[ielem] * vnewc[ielem] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-
-         if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
-      }
-   } );
-
-   Release(&pHalfStep) ;
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcSoundSpeedForElems(Domain* domain,
-                            Real_t *vnewc, Real_t rho0, Real_t *enewc,
-                            Real_t *pnewc, Real_t *pbvc,
-                            Real_t *bvc, Real_t ss4o3,
-                            Index_t len, Index_t *regElemList)
-{
-  
-   RAJA::forall<mat_exec_policy>(0, len, [=] (int i) {
-      Index_t ielem = regElemList[i];
-      Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[ielem] * vnewc[ielem] *
-                 bvc[i] * pnewc[i]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp);
-      }
-      domain->ss(ielem) = ssTmp ;
-   } );
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void EvalEOSForElems(Domain* domain, Real_t *vnewc,
-                     Int_t numElemReg, Index_t *regElemList, Int_t rep)
-{
-   Real_t  e_cut = domain->e_cut() ;
-   Real_t  p_cut = domain->p_cut() ;
-   Real_t  ss4o3 = domain->ss4o3() ;
-   Real_t  q_cut = domain->q_cut() ;
-
-   Real_t eosvmax = domain->eosvmax() ;
-   Real_t eosvmin = domain->eosvmin() ;
-   Real_t pmin    = domain->pmin() ;
-   Real_t emin    = domain->emin() ;
-   Real_t rho0    = domain->refdens() ;
-
-   // These temporaries will be of different size for 
-   // each call (due to different sized region element
-   // lists)
-   Real_t *e_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *delvc = Allocate<Real_t>(numElemReg) ;
-   Real_t *p_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *q_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *compression = Allocate<Real_t>(numElemReg) ;
-   Real_t *compHalfStep = Allocate<Real_t>(numElemReg) ;
-   Real_t *qq_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *ql_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *work = Allocate<Real_t>(numElemReg) ;
-   Real_t *p_new = Allocate<Real_t>(numElemReg) ;
-   Real_t *e_new = Allocate<Real_t>(numElemReg) ;
-   Real_t *q_new = Allocate<Real_t>(numElemReg) ;
-   Real_t *bvc = Allocate<Real_t>(numElemReg) ;
-   Real_t *pbvc = Allocate<Real_t>(numElemReg) ;
- 
-   //loop to add load imbalance based on region number 
-   for(Int_t j = 0; j < rep; j++) {
-      /* compress data, minimal set */
-      RAJA::forall<mat_exec_policy>(0, numElemReg, [=] (int i) {
-         Index_t ielem = regElemList[i];
-         e_old[i] = domain->e(ielem) ;
-         delvc[i] = domain->delv(ielem) ;
-         p_old[i] = domain->p(ielem) ;
-         q_old[i] = domain->q(ielem) ;
-         qq_old[i] = domain->qq(ielem) ;
-         ql_old[i] = domain->ql(ielem) ;
-         work[i] = Real_t(0.) ;
-      } );
-
-      RAJA::forall<mat_exec_policy>(0, numElemReg, [=] (int i) {
-         Index_t ielem = regElemList[i];
-         Real_t vchalf ;
-         compression[i] = Real_t(1.) / vnewc[ielem] - Real_t(1.);
-         vchalf = vnewc[ielem] - delvc[i] * Real_t(.5);
-         compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);
-      } );
-
-      /* Check for v > eosvmax or v < eosvmin */
-      if ( eosvmin != Real_t(0.) ) {
-         RAJA::forall<mat_exec_policy>(0, numElemReg, [=] (int i) {
-            Index_t ielem = regElemList[i];
-            if (vnewc[ielem] <= eosvmin) { /* impossible due to calling func? */
-               compHalfStep[i] = compression[i] ;
-            }
-         } );
-      }
-
-      if ( eosvmax != Real_t(0.) ) {
-         RAJA::forall<mat_exec_policy>(0, numElemReg, [=] (int i) {
-            Index_t ielem = regElemList[i];
-            if (vnewc[ielem] >= eosvmax) { /* impossible due to calling func? */
-               p_old[i]        = Real_t(0.) ;
-               compression[i]  = Real_t(0.) ;
-               compHalfStep[i] = Real_t(0.) ;
-            }
-         } );
-      }
-
-      CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
-                         p_old, e_old,  q_old, compression, compHalfStep,
-                         vnewc, work,  delvc, pmin,
-                         p_cut, e_cut, q_cut, emin,
-                         qq_old, ql_old, rho0, eosvmax,
-                         numElemReg, regElemList);
-   }
-
-   RAJA::forall<mat_exec_policy>(0, numElemReg, [=] (int i) {
-      Index_t ielem = regElemList[i];
-      domain->p(ielem) = p_new[i] ;
-      domain->e(ielem) = e_new[i] ;
-      domain->q(ielem) = q_new[i] ;
-   } );
-
-   CalcSoundSpeedForElems(domain,
-                          vnewc, rho0, e_new, p_new,
-                          pbvc, bvc, ss4o3,
-                          numElemReg, regElemList) ;
-
-   Release(&pbvc) ;
-   Release(&bvc) ;
-   Release(&q_new) ;
-   Release(&e_new) ;
-   Release(&p_new) ;
-   Release(&work) ;
-   Release(&ql_old) ;
-   Release(&qq_old) ;
-   Release(&compHalfStep) ;
-   Release(&compression) ;
-   Release(&q_old) ;
-   Release(&p_old) ;
-   Release(&delvc) ;
-   Release(&e_old) ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void ApplyMaterialPropertiesForElems(Domain* domain)
-{
-   Index_t numElem = domain->numElem() ;
-
-  if (numElem != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = domain->eosvmin() ;
-    Real_t eosvmax = domain->eosvmax() ;
-    Real_t *vnewc = Allocate<Real_t>(numElem) ;
-
-    RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-       vnewc[i] = domain->vnew(i) ;
-
-       // Bound the updated relative volumes with eosvmin/max
-       if (eosvmin != Real_t(0.)) {
-          if (vnewc[i] < eosvmin) {
-             vnewc[i] = eosvmin ;
-          }
-       }
-
-       if (eosvmax != Real_t(0.)) {
-          if (vnewc[i] > eosvmax) {
-             vnewc[i] = eosvmax ;
-          }
-       }
-
-     }
-    ) ;
-
-    // This check may not make perfect sense in LULESH, but
-    // it's representative of something in the full code -
-    // just leave it in, please
-
-    // check for negative element volume
-    RAJA::ReduceMin<reduce_policy, Real_t> minvol(1.0);
-
-    RAJA::forall<elem_exec_policy>(0, numElem, [=] (int i) {
-       Real_t vc = domain->v(i) ;
-       if (eosvmin != Real_t(0.)) {
-          if (vc < eosvmin)
-             vc = eosvmin ;
-       }
-       if (eosvmax != Real_t(0.)) {
-          if (vc > eosvmax)
-             vc = eosvmax ;
-       }
-
-       minvol.min(vc);
-     }
-    ) ;
-
-    if ( Real_t(minvol) <= Real_t(0.) ) {
-#if USE_MPI             
-       MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-       exit(VolumeError);
-#endif
-    }
-
-    for (Int_t r=0 ; r<domain->numReg() ; r++) {
-       Index_t numElemReg = domain->regElemSize(r);
-       Index_t *regElemList = domain->regElemlist(r);
-       Int_t rep;
-       //Determine load imbalance for this region
-       //round down the number with lowest cost
-       if(r < domain->numReg()/2)
-	 rep = 1;
-       //you don't get an expensive region unless you at least have 5 regions
-       else if(r < (domain->numReg() - (domain->numReg()+15)/20))
-         rep = 1 + domain->cost();
-       //very expensive regions
-       else
-	 rep = 10 * (1+ domain->cost());
-       EvalEOSForElems(domain, vnewc, numElemReg, regElemList, rep);
-    }
-
-  }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void UpdateVolumesForElems(Domain* domain,
-                           Real_t v_cut, Index_t length)
-{
-   if (length != 0) {
-      RAJA::forall<range_exec_policy>( int(0), int(length), [=] (int i) {
-         Real_t tmpV = domain->vnew(i) ;
-
-         if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-            tmpV = Real_t(1.0) ;
-
-         domain->v(i) = tmpV ;
-      } );
-   }
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeElements(Domain* domain, Index_t numElem)
-{
-  CalcLagrangeElements(domain) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems(domain) ;
-
-  ApplyMaterialPropertiesForElems(domain) ;
-
-  UpdateVolumesForElems(domain,
-                        domain->v_cut(), numElem) ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcCourantConstraintForElems(Domain* domain, Index_t length,
-                                   Index_t *regElemlist,
-                                   Real_t qqc, Real_t& dtcourant)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dtcourantLoc(Real_t(1.0e+20)) ;
-   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;
-
-   RAJA::forall<mat_exec_policy>(0, length, [=] (int i) {
-
-      Index_t indx = regElemlist[i] ;
-      Real_t dtf = domain->ss(indx) * domain->ss(indx) ;
-
-      if ( domain->vdov(indx) < Real_t(0.) ) {
-         dtf += qqc2 * domain->arealg(indx) * domain->arealg(indx) * 
-                domain->vdov(indx) * domain->vdov(indx) ;
-      }
-
-      Real_t dtf_cmp = (domain->vdov(indx) != Real_t(0.))
-                     ?  domain->arealg(indx) / SQRT(dtf) : Real_t(1.0e+20) ;
-
-      /* determine minimum timestep with its corresponding elem */
-      dtcourantLoc.min(dtf_cmp) ;
-   } ) ;
-
-   /* Don't try to register a time constraint if none of the elements
-    * were active */
-   if (dtcourantLoc < Real_t(1.0e+20)) {
-      dtcourant = dtcourantLoc ;
-   }
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcHydroConstraintForElems(Domain* domain, Index_t length,
-                                 Index_t *regElemlist, Real_t dvovmax, Real_t& dthydro)
-{
-   RAJA::ReduceMin<reduce_policy, Real_t> dthydroLoc(Real_t(1.0e+20)) ;
-
-   RAJA::forall<mat_exec_policy>(0, length, [=] (int i) {
-
-      Index_t indx = regElemlist[i] ;
-
-       Real_t dtvov_cmp = (domain->vdov(indx) != Real_t(0.))
-                        ? (dvovmax / (FABS(domain->vdov(indx))+Real_t(1.e-20)))
-                        : Real_t(1.0e+20) ;
-
-      dthydroLoc.min(dtvov_cmp) ;
-   } ) ;
-
-   if (dthydroLoc < Real_t(1.0e+20)) {
-      dthydro = dthydroLoc ;
-   }
-
-   return ;
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void CalcTimeConstraintsForElems(Domain* domain) {
-
-   // Initialize conditions to a very large value
-   domain->dtcourant() = 1.0e+20;
-   domain->dthydro() = 1.0e+20;
-
-   for (Index_t r=0 ; r < domain->numReg() ; ++r) {
-      /* evaluate time constraint */
-      CalcCourantConstraintForElems(domain, domain->regElemSize(r),
-                                    domain->regElemlist(r),
-                                    domain->qqc(),
-                                    domain->dtcourant()) ;
-
-      /* check hydro constraint */
-      CalcHydroConstraintForElems(domain, domain->regElemSize(r),
-                                  domain->regElemlist(r),
-                                  domain->dvovmax(),
-                                  domain->dthydro()) ;
-   }
-}
-
-/******************************************/
-
-RAJA_STORAGE
-void LagrangeLeapFrog(Domain* domain)
-{
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   Domain_member fieldData[6] ;
-#endif
-
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal(domain);
-
-
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-#endif
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements(domain, domain->numElem());
-
-#if USE_MPI   
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   CommRecv(*domain, MSG_SYNC_POS_VEL, 6,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-
-   fieldData[0] = &Domain::x ;
-   fieldData[1] = &Domain::y ;
-   fieldData[2] = &Domain::z ;
-   fieldData[3] = &Domain::xd ;
-   fieldData[4] = &Domain::yd ;
-   fieldData[5] = &Domain::zd ;
-   
-   CommSend(*domain, MSG_SYNC_POS_VEL, 6, fieldData,
-            domain->sizeX() + 1, domain->sizeY() + 1, domain->sizeZ() + 1,
-            false, false) ;
-#endif
-#endif   
-
-   CalcTimeConstraintsForElems(domain);
-
-#if USE_MPI   
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   CommSyncPosVel(*domain) ;
-#endif
-#endif   
-}
-
-
-/******************************************/
-
-int main(int argc, char *argv[])
-{
-   Domain *locDom ;
-   Int_t numRanks ;
-   Int_t myRank ;
-   struct cmdLineOpts opts;
-
-#if USE_MPI   
-   Domain_member fieldData ;
-
-   MPI_Init(&argc, &argv) ;
-   MPI_Comm_size(MPI_COMM_WORLD, &numRanks) ;
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-#else
-   numRanks = 1;
-   myRank = 0;
-#endif   
-
-   /* Set defaults that can be overridden by command line opts */
-   opts.its = 9999999;
-   opts.nx  = 30;
-   opts.numReg = 11;
-   opts.numFiles = (int)(numRanks+10)/9;
-   opts.showProg = 0;
-   opts.quiet = 0;
-   opts.viz = 0;
-   opts.balance = 1;
-   opts.cost = 1;
-
-   ParseCommandLineOptions(argc, argv, myRank, &opts);
-
-   if ((myRank == 0) && (opts.quiet == 0)) {
-      printf("Running problem size %d^3 per domain until completion\n", opts.nx);
-      printf("Num processors: %d\n", numRanks);
-#if USE_OMP
-      printf("Num threads: %d\n", omp_get_max_threads());
-#endif
-      printf("Total number of elements: %lld\n\n", (long long int)(numRanks*opts.nx*opts.nx*opts.nx));
-      printf("To run other sizes, use -s <integer>.\n");
-      printf("To run a fixed number of iterations, use -i <integer>.\n");
-      printf("To run a more or less balanced region set, use -b <integer>.\n");
-      printf("To change the relative costs of regions, use -c <integer>.\n");
-      printf("To print out progress, use -p\n");
-      printf("To write an output file for VisIt, use -v\n");
-      printf("See help (-h) for more options\n\n");
-   }
-
-   // Set up the mesh and decompose. Assumes regular cubes for now
-   Int_t col, row, plane, side;
-   InitMeshDecomp(numRanks, myRank, &col, &row, &plane, &side);
-
-   // Build the main data structure and initialize it
-   locDom = new Domain(numRanks, col, row, plane, opts.nx,
-                       side, opts.numReg, opts.balance, opts.cost) ;
-
-
-#if USE_MPI   
-   fieldData = &Domain::nodalMass ;
-
-   // Initial domain boundary communication 
-   CommRecv(*locDom, MSG_COMM_SBN, 1,
-            locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() + 1,
-            true, false) ;
-   CommSend(*locDom, MSG_COMM_SBN, 1, &fieldData,
-            locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() +  1,
-            true, false) ;
-   CommSBN(*locDom, 1, &fieldData) ;
-
-   // End initialization
-   MPI_Barrier(MPI_COMM_WORLD);
-#endif   
-   
-   // BEGIN timestep to solution */
-#if USE_MPI   
-   double start = MPI_Wtime();
-#else
-   timeval start;
-   gettimeofday(&start, NULL) ;
-#endif
-//debug to see region sizes
-//   for(Int_t i = 0; i < locDom->numReg(); i++)
-//      std::cout << "region" << i + 1<< "size" << locDom->regElemSize(i) <<std::endl;
-   while((locDom->time() < locDom->stoptime()) && (locDom->cycle() < opts.its)) {
-
-      TimeIncrement(*locDom) ;
-      LagrangeLeapFrog(locDom) ;
-
-      if ((opts.showProg != 0) && (opts.quiet == 0) && (myRank == 0)) {
-         printf("cycle = %d, time = %e, dt=%e\n",
-                locDom->cycle(), double(locDom->time()), double(locDom->deltatime()) ) ;
-      }
-   }
-
-   // Use reduced max elapsed time
-   double elapsed_time;
-#if USE_MPI   
-   elapsed_time = MPI_Wtime() - start;
-#else
-   timeval end;
-   gettimeofday(&end, NULL) ;
-   elapsed_time = (double)(end.tv_sec - start.tv_sec) + ((double)(end.tv_usec - start.tv_usec))/1000000 ;
-#endif
-   double elapsed_timeG;
-#if USE_MPI   
-   MPI_Reduce(&elapsed_time, &elapsed_timeG, 1, MPI_DOUBLE,
-              MPI_MAX, 0, MPI_COMM_WORLD);
-#else
-   elapsed_timeG = elapsed_time;
-#endif
-
-   // Write out final viz file */
-   if (opts.viz) {
-      DumpToVisit(*locDom, opts.numFiles, myRank, numRanks) ;
-   }
-   
-   if ((myRank == 0) && (opts.quiet == 0)) {
-      VerifyAndWriteFinalOutput(elapsed_timeG, *locDom, opts.nx, numRanks);
-   }
-
-   delete locDom;
-
-#if USE_MPI
-   MPI_Finalize() ;
-#endif
-
-   return 0 ;
-}
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.h
deleted file mode 100644
index a66b7f24e..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh.h
+++ /dev/null
@@ -1,632 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-
-// OpenMP will be compiled in if this flag is set to 1 AND the compiler beging
-// used supports it (i.e. the _OPENMP symbol is defined)
-//#define USE_OMP 1
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <math.h>
-#include <vector>
-
-//#define TRY_NO_WAIT
-#undef TRY_NO_WAIT
-
-#if defined(TRY_NO_WAIT)
-#include "RAJAspecial.hxx"
-#endif
-#include "RAJA/RAJA.hxx"
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Int_t numNode) // Node-centered
-   {
-      m_x.resize(numNode);  // coordinates
-      m_y.resize(numNode);
-      m_z.resize(numNode);
-
-      m_xd.resize(numNode); // velocities
-      m_yd.resize(numNode);
-      m_zd.resize(numNode);
-
-      m_xdd.resize(numNode); // accelerations
-      m_ydd.resize(numNode);
-      m_zdd.resize(numNode);
-
-      m_fx.resize(numNode);  // forces
-      m_fy.resize(numNode);
-      m_fz.resize(numNode);
-
-      m_nodalMass.resize(numNode);  // mass
-   }
-
-   void AllocateElemPersistent(Int_t numElem) // Elem-centered
-   {
-      m_nodelist.resize(8*numElem);
-
-      // elem connectivities through face
-      m_lxim.resize(numElem);
-      m_lxip.resize(numElem);
-      m_letam.resize(numElem);
-      m_letap.resize(numElem);
-      m_lzetam.resize(numElem);
-      m_lzetap.resize(numElem);
-
-      m_elemBC.resize(numElem);
-
-      m_e.resize(numElem);
-      m_p.resize(numElem);
-
-      m_q.resize(numElem);
-      m_ql.resize(numElem);
-      m_qq.resize(numElem);
-
-      m_v.resize(numElem);
-
-      m_volo.resize(numElem);
-      m_delv.resize(numElem);
-      m_vdov.resize(numElem);
-
-      m_arealg.resize(numElem);
-
-      m_ss.resize(numElem);
-
-      m_elemMass.resize(numElem);
-    
-      m_vnew.resize(numElem) ;
-   }
-
-   void AllocateGradients(Int_t numElem, Int_t allElem)
-   {
-      // Position gradients
-      m_delx_xi.resize(numElem) ;
-      m_delx_eta.resize(numElem) ;
-      m_delx_zeta.resize(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi.resize(allElem) ;
-      m_delv_eta.resize(allElem);
-      m_delv_zeta.resize(allElem) ;
-   }
-
-   void DeallocateGradients()
-   {
-      m_delx_zeta.clear() ;
-      m_delx_eta.clear() ;
-      m_delx_xi.clear() ;
-
-      m_delv_zeta.clear() ;
-      m_delv_eta.clear() ;
-      m_delv_xi.clear() ;
-   }
-
-   void AllocateStrains(Int_t numElem)
-   {
-      m_dxx.resize(numElem) ;
-      m_dyy.resize(numElem) ;
-      m_dzz.resize(numElem) ;
-   }
-
-   void DeallocateStrains()
-   {
-      m_dzz.clear() ;
-      m_dyy.clear() ;
-      m_dxx.clear() ;
-   }
-   
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_x[idx] ; }
-   Real_t& y(Index_t idx)    { return m_y[idx] ; }
-   Real_t& z(Index_t idx)    { return m_z[idx] ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
-   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
-   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
-   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
-   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
-   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
-   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   // Nodes on symmertry planes
-   Index_t symmX(Index_t idx) { return m_symmX[idx] ; }
-   Index_t symmY(Index_t idx) { return m_symmY[idx] ; }
-   Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; }
-   bool symmXempty()          { return m_symmX.empty(); }
-   bool symmYempty()          { return m_symmY.empty(); }
-   bool symmZempty()          { return m_symmZ.empty(); }
-
-   //
-   // Element-centered
-   //
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_t*  regNumList()            { return &m_regNumList[0] ; }
-   Index_t*  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
-
-   Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
-   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
-   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
-   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
-   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
-   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_p[idx] ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_q[idx] ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_v[idx] ; }
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_t *nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-   
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_t *commDataSend ;
-   Real_t *commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems);
-   void SetupThreadSupportStructures();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void SetupCommBuffers(Int_t edgeNodes);
-   void SetupSymmetryPlanes(Int_t edgeNodes);
-   void SetupElementConnectivities(Int_t edgeElems);
-   void SetupBoundaryConditions(Int_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* Node-centered */
-   std::vector<Real_t> m_x ;  /* coordinates */
-   std::vector<Real_t> m_y ;
-   std::vector<Real_t> m_z ;
-
-   std::vector<Real_t> m_xd ; /* velocities */
-   std::vector<Real_t> m_yd ;
-   std::vector<Real_t> m_zd ;
-
-   std::vector<Real_t> m_xdd ; /* accelerations */
-   std::vector<Real_t> m_ydd ;
-   std::vector<Real_t> m_zdd ;
-
-   std::vector<Real_t> m_fx ;  /* forces */
-   std::vector<Real_t> m_fy ;
-   std::vector<Real_t> m_fz ;
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
-   std::vector<Index_t> m_symmY ;
-   std::vector<Index_t> m_symmZ ;
-
-   // Element-centered
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_t *m_regElemSize ;   // Size of region sets
-   Index_t *m_regNumList ;    // Region number per domain element
-   Index_t **m_regElemlist ;  // region indexset 
-
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */
-   std::vector<Index_t>  m_lxip ;
-   std::vector<Index_t>  m_letam ;
-   std::vector<Index_t>  m_letap ;
-   std::vector<Index_t>  m_lzetam ;
-   std::vector<Index_t>  m_lzetap ;
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   std::vector<Real_t> m_p ;   /* pressure */
-   std::vector<Real_t> m_q ;   /* q */
-   std::vector<Real_t> m_ql ;  /* linear term for q */
-   std::vector<Real_t> m_qq ;  /* quadratic term for q */
-
-   std::vector<Real_t> m_v ;     /* relative volume */
-   std::vector<Real_t> m_volo ;  /* reference volume */
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-   // OMP hack 
-   Index_t *m_nodeElemStart ;
-   Index_t *m_nodeElemCornerList ;
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh_tuple.h b/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh_tuple.h
deleted file mode 100644
index f1c04577e..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_RAJA-variants/LULESH-v2.0_RAJA-basic/lulesh_tuple.h
+++ /dev/null
@@ -1,618 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-
-// OpenMP will be compiled in if this flag is set to 1 AND the compiler beging
-// used supports it (i.e. the _OPENMP symbol is defined)
-//#define USE_OMP 1
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <math.h>
-#include <vector>
-
-#include "RAJA/RAJA.hxx"
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Int_t numNode) // Node-centered
-   {
-      m_coord.resize(numNode);  // coordinates
-
-      m_vel.resize(numNode); // velocities
-
-      m_acc.resize(numNode); // accelerations
-
-      m_force.resize(numNode);  // forces
-
-      m_nodalMass.resize(numNode);  // mass
-   }
-
-   void AllocateElemPersistent(Int_t numElem) // Elem-centered
-   {
-      m_nodelist.resize(8*numElem);
-
-      // elem connectivities through face
-      m_faceToElem.resize(numElem);
-
-      m_elemBC.resize(numElem);
-
-      m_e.resize(numElem);
-
-      m_pq.resize(numElem);
-
-      m_qlqq.resize(numElem);
-
-      m_vol.resize(numElem);
-
-      m_delv.resize(numElem);
-      m_vdov.resize(numElem);
-
-      m_arealg.resize(numElem);
-
-      m_ss.resize(numElem);
-
-      m_elemMass.resize(numElem);
-
-      m_vnew.resize(numElem) ;
-   }
-
-   void AllocateGradients(Int_t numElem, Int_t allElem)
-   {
-      // Position gradients
-      m_delx_xi.resize(numElem) ;
-      m_delx_eta.resize(numElem) ;
-      m_delx_zeta.resize(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi.resize(allElem) ;
-      m_delv_eta.resize(allElem);
-      m_delv_zeta.resize(allElem) ;
-   }
-
-   void DeallocateGradients()
-   {
-      m_delx_zeta.clear() ;
-      m_delx_eta.clear() ;
-      m_delx_xi.clear() ;
-
-      m_delv_zeta.clear() ;
-      m_delv_eta.clear() ;
-      m_delv_xi.clear() ;
-   }
-
-   void AllocateStrains(Int_t numElem)
-   {
-      m_dxx.resize(numElem) ;
-      m_dyy.resize(numElem) ;
-      m_dzz.resize(numElem) ;
-   }
-
-   void DeallocateStrains()
-   {
-      m_dzz.clear() ;
-      m_dyy.clear() ;
-      m_dxx.clear() ;
-   }
-   
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_coord[idx].x ; }
-   Real_t& y(Index_t idx)    { return m_coord[idx].y ; }
-   Real_t& z(Index_t idx)    { return m_coord[idx].z ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_vel[idx].x ; }
-   Real_t& yd(Index_t idx)   { return m_vel[idx].y ; }
-   Real_t& zd(Index_t idx)   { return m_vel[idx].z ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_acc[idx].x ; }
-   Real_t& ydd(Index_t idx)  { return m_acc[idx].y ; }
-   Real_t& zdd(Index_t idx)  { return m_acc[idx].z ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_force[idx].x ; }
-   Real_t& fy(Index_t idx)   { return m_force[idx].y ; }
-   Real_t& fz(Index_t idx)   { return m_force[idx].z ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   // Nodes on symmertry planes
-   Index_t symmX(Index_t idx) { return m_symmX[idx] ; }
-   Index_t symmY(Index_t idx) { return m_symmY[idx] ; }
-   Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; }
-   bool symmXempty()          { return m_symmX.empty(); }
-   bool symmYempty()          { return m_symmY.empty(); }
-   bool symmZempty()          { return m_symmZ.empty(); }
-
-   //
-   // Element-centered
-   //
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_t*  regNumList()            { return &m_regNumList[0] ; }
-   Index_t*  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
-
-   Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_faceToElem[idx].lxim ; }
-   Index_t&  lxip(Index_t idx) { return m_faceToElem[idx].lxip ; }
-   Index_t&  letam(Index_t idx) { return m_faceToElem[idx].letam ; }
-   Index_t&  letap(Index_t idx) { return m_faceToElem[idx].letap ; }
-   Index_t&  lzetam(Index_t idx) { return m_faceToElem[idx].lzetam ; }
-   Index_t&  lzetap(Index_t idx) { return m_faceToElem[idx].lzetap ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_pq[idx].p ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_pq[idx].q ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_qlqq[idx].ql ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qlqq[idx].qq ; }
-
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_vol[idx].v ; }
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_vol[idx].volo ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_t *nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-   
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_t *commDataSend ;
-   Real_t *commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems);
-   void SetupThreadSupportStructures();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void SetupCommBuffers(Int_t edgeNodes);
-   void SetupSymmetryPlanes(Int_t edgeNodes);
-   void SetupElementConnectivities(Int_t edgeElems);
-   void SetupBoundaryConditions(Int_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* Node-centered */
-
-   struct Tuple3 {
-      Real_t x, y, z ;
-   } ;
-
-   std::vector<Tuple3> m_coord ;  /* coordinates */
-
-   std::vector<Tuple3> m_vel ; /* velocities */
-
-   std::vector<Tuple3> m_acc ; /* accelerations */
-
-   std::vector<Tuple3> m_force ;  /* forces */
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
-   std::vector<Index_t> m_symmY ;
-   std::vector<Index_t> m_symmZ ;
-
-   // Element-centered
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_t *m_regElemSize ;   // Size of region sets
-   Index_t *m_regNumList ;    // Region number per domain element
-   Index_t **m_regElemlist ;  // region indexset 
-
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   struct FaceElemConn {
-      Index_t lxim, lxip, letam, letap, lzetam, lzetap ;
-   } ;
-
-   std::vector<FaceElemConn> m_faceToElem ; /* element conn across faces */
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   struct Pcomponents {
-      Real_t p, q ;
-   } ;
-
-   std::vector<Pcomponents> m_pq ;   /* pressure and artificial viscosity */
-
-   struct Qcomponents {
-      Real_t ql, qq ;
-   } ;
-
-   std::vector<Qcomponents> m_qlqq ;  /* linear and quadratic terms for q */
-
-   struct Volume {
-      Real_t v, volo ;
-   } ;
-
-   std::vector<Volume> m_vol ;     /* relative and reference volume */
-
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-   // OMP hack 
-   Index_t *m_nodeElemStart ;
-   Index_t *m_nodeElemCornerList ;
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_baseline/CMakeLists.txt b/test/LULESH-v2.0/LULESH-v2.0_baseline/CMakeLists.txt
deleted file mode 100644
index 590a33c50..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_baseline/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-###############################################################################
-#
-# Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-#
-# Produced at the Lawrence Livermore National Laboratory
-#
-# LLNL-CODE-689114
-#
-# All rights reserved.
-#
-# This file is part of RAJA.
-#
-# For additional details, please also read raja/README-license.txt.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the disclaimer below.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the disclaimer (as noted below) in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the name of the LLNS/LLNL nor the names of its contributors may
-#   be used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY,
-# LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-###############################################################################
-
-if (RAJA_ENABLE_OPENMP)
-add_definitions(-DUSE_OMP=1)
-else()
-add_definitions(-DUSE_OMP=0)
-endif()
-add_definitions(-DUSE_MPI=0)
-
-add_executable(lulesh2.0NORAJA.exe
-  lulesh.cc
-  lulesh-comm.cc
-  lulesh-init.cc
-  lulesh-util.cc
-  lulesh-viz.cc)
-target_link_libraries(lulesh2.0NORAJA.exe ${RT_LIBRARIES})
diff --git a/test/LULESH-v2.0/LULESH-v2.0_baseline/README b/test/LULESH-v2.0/LULESH-v2.0_baseline/README
deleted file mode 100644
index 8b0f260ba..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_baseline/README
+++ /dev/null
@@ -1,53 +0,0 @@
-This is the README for LULESH 2.0
-
-More information including LULESH 1.0 can be found at https://codesign.llnl.gov/lulesh.php
-
-If you have any questions or problems please contact:
-
-Ian Karlin <karlin1@llnl.gov>
-Jeff Keasler <keasler1@llnl.gov> or
-Rob Neely <neely4@llnl.gov>
-
-Also please send any notable results to Ian Karlin <karlin1@llnl.gov> as we are still evaluating the performance of this code.
-
-*** Notable changes in LULESH 2.0 ***
-
-Split functionality into different files
-lulesh.cc - where most (all?) of the timed functionality lies
-lulesh-comm.cc - MPI functionality
-lulesh-init.cc - Setup code
-lulesh-viz.cc  - Support for visualization option
-lulesh-util.cc - Non-timed functions
-
-The concept of "regions" was added, although every region is the same ideal gas material, and the same sedov blast wave problem is still the only problem its hardcoded to solve. Regions allow two things important to making this proxy app more representative:
-
-Four of the LULESH routines are now performed on a region-by-region basis, making the memory access patterns non-unit stride
-
-Artificial load imbalances can be easily introduced that could impact parallelization strategies.  
-   * The load balance flag changes region assignment.  Region number is raised to the power entered for assignment probability.  Most likely regions changes with MPI process id.
-   * The cost flag raises the cost of ~45% of the regions to evaluate EOS by the entered multiple.  The cost of 5% is 10x the entered
- multiple.
-
-MPI and OpenMP were added, and coalesced into a single version of the source that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
-
-Added support to write plot files using "poor mans parallel I/O" when linked with the silo library, which in turn can be read by VisIt.
-
-Enabled variable timestep calculation by default (courant condition), which results in an additional reduction.  Also, seeded the initial timestep based on analytical equation to allow scaling to arbitrary size.  Therefore steps to solution will differ from LULESH 1.0.
-
-Default domain (mesh) size reduced from 45^3 to 30^3
-
-Command line options to allow for numerous test cases without needing to recompile
-
-Performance optimizations and code cleanup uncovered during study of LULESH 1.0
-
-Added a "Figure of Merit" calculation (elements solved per microsecond) and output in support of using LULESH 2.0 for the 2017 CORAL procurement
-
-Possible Future 2.0 minor updates (other changes possible as discovered)
-
-* Different default parameters
-* Minor code performance changes and cleanupS
-
-TODO in future versions
-* Add reader for (truly) unstructured meshes, probably serial only
-* CMake based build system
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-comm.cc b/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-comm.cc
deleted file mode 100644
index a30c3ec1c..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-comm.cc
+++ /dev/null
@@ -1,1837 +0,0 @@
-#include "lulesh.h"
-
-// If no MPI, then this whole file is stubbed out
-#if USE_MPI
-
-#include <mpi.h>
-#include <string.h>
-
-/* Comm Routines */
-
-#define ALLOW_UNPACKED_PLANE false
-#define ALLOW_UNPACKED_ROW   false
-#define ALLOW_UNPACKED_COL   false
-
-/*
-   There are coherence issues for packing and unpacking message
-   buffers.  Ideally, you would like a lot of threads to 
-   cooperate in the assembly/dissassembly of each message.
-   To do that, each thread should really be operating in a
-   different coherence zone.
-
-   Let's assume we have three fields, f1 through f3, defined on
-   a 61x61x61 cube.  If we want to send the block boundary
-   information for each field to each neighbor processor across
-   each cube face, then we have three cases for the
-   memory layout/coherence of data on each of the six cube
-   boundaries:
-
-      (a) Two of the faces will be in contiguous memory blocks
-      (b) Two of the faces will be comprised of pencils of
-          contiguous memory.
-      (c) Two of the faces will have large strides between
-          every value living on the face.
-
-   How do you pack and unpack this data in buffers to
-   simultaneous achieve the best memory efficiency and
-   the most thread independence?
-
-   Do do you pack field f1 through f3 tighly to reduce message
-   size?  Do you align each field on a cache coherence boundary
-   within the message so that threads can pack and unpack each
-   field independently?  For case (b), do you align each
-   boundary pencil of each field separately?  This increases
-   the message size, but could improve cache coherence so
-   each pencil could be processed independently by a separate
-   thread with no conflicts.
-
-   Also, memory access for case (c) would best be done without
-   going through the cache (the stride is so large it just causes
-   a lot of useless cache evictions).  Is it worth creating
-   a special case version of the packing algorithm that uses
-   non-coherent load/store opcodes?
-*/
-
-/******************************************/
-
-
-/* doRecv flag only works with regular block structure */
-void CommRecv(Domain& domain, int msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz, bool doRecv, bool planeOnly) {
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   /* post recieve buffers for all incoming messages */
-   int myRank ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   for (Index_t i=0; i<26; ++i) {
-      domain.recvRequest[i] = MPI_REQUEST_NULL ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   /* post receives */
-
-   /* receive data from neighboring domain faces */
-   if (planeMin && doRecv) {
-      /* contiguous memory */
-      int fromRank = myRank - domain.tp()*domain.tp() ;
-      int recvCount = dx * dy * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (planeMax) {
-      /* contiguous memory */
-      int fromRank = myRank + domain.tp()*domain.tp() ;
-      int recvCount = dx * dy * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (rowMin && doRecv) {
-      /* semi-contiguous memory */
-      int fromRank = myRank - domain.tp() ;
-      int recvCount = dx * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (rowMax) {
-      /* semi-contiguous memory */
-      int fromRank = myRank + domain.tp() ;
-      int recvCount = dx * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (colMin && doRecv) {
-      /* scattered memory */
-      int fromRank = myRank - 1 ;
-      int recvCount = dy * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-   if (colMax) {
-      /* scattered memory */
-      int fromRank = myRank + 1 ;
-      int recvCount = dy * dz * xferFields ;
-      MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
-                recvCount, baseType, fromRank, msgType,
-                MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
-      ++pmsg ;
-   }
-
-   if (!planeOnly) {
-      /* receive data from domains connected only by an edge */
-      if (rowMin && colMin && doRecv) {
-         int fromRank = myRank - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMax) {
-         int fromRank = myRank + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMin) {
-         int fromRank = myRank + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMax) {
-         int fromRank = myRank + domain.tp()*domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMax && doRecv) {
-         int fromRank = myRank - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dz * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dx * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMin && doRecv) {
-         int fromRank = myRank - domain.tp()*domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm],
-                   dy * xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      /* receive data from domains connected only by a corner */
-      if (rowMin && colMin && planeMin && doRecv) {
-         /* corner at domain logical coord (0, 0, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMin && planeMax) {
-         /* corner at domain logical coord (0, 0, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMin && doRecv) {
-         /* corner at domain logical coord (1, 0, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMax) {
-         /* corner at domain logical coord (1, 0, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMin && doRecv) {
-         /* corner at domain logical coord (0, 1, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMax) {
-         /* corner at domain logical coord (0, 1, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMin && doRecv) {
-         /* corner at domain logical coord (1, 1, 0) */
-         int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMax) {
-         /* corner at domain logical coord (1, 1, 1) */
-         int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
-         MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
-                                         emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL],
-                   xferFields, baseType, fromRank, msgType,
-                   MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-   }
-}
-
-/******************************************/
-
-void CommSend(Domain& domain, int msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly)
-{
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   /* post recieve buffers for all incoming messages */
-   int myRank ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
-   MPI_Status status[26] ;
-   Real_t *destAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   for (Index_t i=0; i<26; ++i) {
-      domain.sendRequest[i] = MPI_REQUEST_NULL ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   /* post sends */
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dx * dy ;
-
-      if (planeMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<sendCount; ++i) {
-               destAddr[i] = (domain.*src)(i) ;
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - domain.tp()*domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (planeMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<sendCount; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz - 1) + i) ;
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + domain.tp()*domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dx * dz ;
-
-      if (rowMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  destAddr[i*dx+j] = (domain.*src)(i*dx*dy + j) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (rowMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  destAddr[i*dx+j] = (domain.*src)(dx*(dy - 1) + i*dx*dy + j) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + domain.tp(), msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      int sendCount = dy * dz ;
-
-      if (colMin) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  destAddr[i*dy + j] = (domain.*src)(i*dx*dy + j*dx) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank - 1, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-      if (colMax && doSend) {
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  destAddr[i*dy + j] = (domain.*src)(dx - 1 + i*dx*dy + j*dx) ;
-               }
-            }
-            destAddr += sendCount ;
-         }
-         destAddr -= xferFields*sendCount ;
-
-         MPI_Isend(destAddr, xferFields*sendCount, baseType,
-                   myRank + 1, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
-         ++pmsg ;
-      }
-   }
-
-   if (!planeOnly) {
-      if (rowMin && colMin) {
-         int toRank = myRank - domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMax && doSend) {
-         int toRank = myRank + domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy - 1 + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-              destAddr[i] = (domain.*src)(dx*(dy-1) + dx*dy*(dz-1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + dx - 1 + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && colMin && doSend) {
-         int toRank = myRank + domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx*(dy-1) + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMin && planeMax && doSend) {
-         int toRank = myRank + domain.tp()*domain.tp() - 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMax) {
-         int toRank = myRank - domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               destAddr[i] = (domain.*src)(dx - 1 + i*dx*dy) ;
-            }
-            destAddr += dz ;
-         }
-         destAddr -= xferFields*dz ;
-         MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMax && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dx; ++i) {
-               destAddr[i] = (domain.*src)(dx*(dy - 1) + i) ;
-            }
-            destAddr += dx ;
-         }
-         destAddr -= xferFields*dx ;
-         MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (colMax && planeMin) {
-         int toRank = myRank - domain.tp()*domain.tp() + 1 ;
-         destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
-                                          emsg * maxEdgeComm] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            Domain_member src = fieldData[fi] ;
-            for (Index_t i=0; i<dy; ++i) {
-               destAddr[i] = (domain.*src)(dx - 1 + i*dx) ;
-            }
-            destAddr += dy ;
-         }
-         destAddr -= xferFields*dy ;
-         MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
-         ++emsg ;
-      }
-
-      if (rowMin && colMin && planeMin) {
-         /* corner at domain logical coord (0, 0, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(0) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMin && planeMax && doSend) {
-         /* corner at domain logical coord (0, 0, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMin) {
-         /* corner at domain logical coord (1, 0, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMin && colMax && planeMax && doSend) {
-         /* corner at domain logical coord (1, 0, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMin) {
-         /* corner at domain logical coord (0, 1, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*(dy - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMin && planeMax && doSend) {
-         /* corner at domain logical coord (0, 1, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMin) {
-         /* corner at domain logical coord (1, 1, 0) */
-         int toRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-      if (rowMax && colMax && planeMax && doSend) {
-         /* corner at domain logical coord (1, 1, 1) */
-         int toRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
-         Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
-                                                emsg * maxEdgeComm +
-                                         cmsg * CACHE_COHERENCE_PAD_REAL] ;
-         Index_t idx = dx*dy*dz - 1 ;
-         for (Index_t fi=0; fi<xferFields; ++fi) {
-            comBuf[fi] = (domain.*fieldData[fi])(idx) ;
-         }
-         MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
-                   MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
-         ++cmsg ;
-      }
-   }
-
-   MPI_Waitall(26, domain.sendRequest, status) ;
-}
-
-/******************************************/
-
-void CommSBN(Domain& domain, int xferFields, Domain_member *fieldData) {
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   /* summation order should be from smallest value to largest */
-   /* or we could try out kahan summation! */
-
-   int myRank ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   Index_t dx = domain.sizeX() + 1 ;
-   Index_t dy = domain.sizeY() + 1 ;
-   Index_t dz = domain.sizeZ() + 1 ;
-   MPI_Status status ;
-   Real_t *srcAddr ;
-   Index_t rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = 1 ;
-   if (domain.rowLoc() == 0) {
-      rowMin = 0 ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = 0 ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = 0 ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = 0 ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = 0 ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = 0 ;
-   }
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ;
-
-      if (planeMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(i) += srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (planeMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(dx*dy*(dz - 1) + i) += srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ;
-
-      if (rowMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(i*dx*dy + j) += srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) += srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ;
-
-      if (colMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(dx - 1 + i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin & colMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & colMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*dy - 1 + i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & colMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*(dy-1) + i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin & planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & colMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx - 1 + i*dx*dy) += srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy - 1) + i) += srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax & planeMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx - 1 + i*dx) += srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin & colMin & planeMin) {
-      /* corner at domain logical coord (0, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(0) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMin & planeMax) {
-      /* corner at domain logical coord (0, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMax & planeMin) {
-      /* corner at domain logical coord (1, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin & colMax & planeMax) {
-      /* corner at domain logical coord (1, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMin & planeMin) {
-      /* corner at domain logical coord (0, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMin & planeMax) {
-      /* corner at domain logical coord (0, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMax & planeMin) {
-      /* corner at domain logical coord (1, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax & colMax & planeMax) {
-      /* corner at domain logical coord (1, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*dz - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) += comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-}
-
-/******************************************/
-
-void CommSyncPosVel(Domain& domain) {
-
-   if (domain.numRanks() == 1)
-      return ;
-
-   int myRank ;
-   bool doRecv = false ;
-   Index_t xferFields = 6 ; /* x, y, z, xd, yd, zd */
-   Domain_member fieldData[6] ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t emsg = 0 ; /* edge comm msg */
-   Index_t cmsg = 0 ; /* corner comm msg */
-   Index_t dx = domain.sizeX() + 1 ;
-   Index_t dy = domain.sizeY() + 1 ;
-   Index_t dz = domain.sizeZ() + 1 ;
-   MPI_Status status ;
-   Real_t *srcAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   fieldData[0] = &Domain::x ;
-   fieldData[1] = &Domain::y ;
-   fieldData[2] = &Domain::z ;
-   fieldData[3] = &Domain::xd ;
-   fieldData[4] = &Domain::yd ;
-   fieldData[5] = &Domain::zd ;
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ;
-
-      if (planeMin && doRecv) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (planeMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(dx*dy*(dz - 1) + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ;
-
-      if (rowMin && doRecv) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(i*dx*dy + j) = srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dx; ++j) {
-                  (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) = srcAddr[i*dx + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ;
-
-      if (colMin && doRecv) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<dz; ++i) {
-               for (Index_t j=0; j<dy; ++j) {
-                  (domain.*dest)(dx - 1 + i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
-               }
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin && colMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && colMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*dy - 1 + i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && colMin) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx*(dy-1) + i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMin && planeMax) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx*dy*(dz-1) + i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMin && colMax && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dz; ++i) {
-            (domain.*dest)(dx - 1 + i*dx*dy) = srcAddr[i] ;
-         }
-         srcAddr += dz ;
-      }
-      ++emsg ;
-   }
-
-   if (rowMax && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dx; ++i) {
-            (domain.*dest)(dx*(dy - 1) + i) = srcAddr[i] ;
-         }
-         srcAddr += dx ;
-      }
-      ++emsg ;
-   }
-
-   if (colMax && planeMin && doRecv) {
-      srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                       emsg * maxEdgeComm] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
-      for (Index_t fi=0 ; fi<xferFields; ++fi) {
-         Domain_member dest = fieldData[fi] ;
-         for (Index_t i=0; i<dy; ++i) {
-            (domain.*dest)(dx - 1 + i*dx) = srcAddr[i] ;
-         }
-         srcAddr += dy ;
-      }
-      ++emsg ;
-   }
-
-
-   if (rowMin && colMin && planeMin && doRecv) {
-      /* corner at domain logical coord (0, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(0) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMin && planeMax) {
-      /* corner at domain logical coord (0, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMax && planeMin && doRecv) {
-      /* corner at domain logical coord (1, 0, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMin && colMax && planeMax) {
-      /* corner at domain logical coord (1, 0, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMin && planeMin && doRecv) {
-      /* corner at domain logical coord (0, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMin && planeMax) {
-      /* corner at domain logical coord (0, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMax && planeMin && doRecv) {
-      /* corner at domain logical coord (1, 1, 0) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-   if (rowMax && colMax && planeMax) {
-      /* corner at domain logical coord (1, 1, 1) */
-      Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
-                                             emsg * maxEdgeComm +
-                                      cmsg * CACHE_COHERENCE_PAD_REAL] ;
-      Index_t idx = dx*dy*dz - 1 ;
-      MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
-      for (Index_t fi=0; fi<xferFields; ++fi) {
-         (domain.*fieldData[fi])(idx) = comBuf[fi] ;
-      }
-      ++cmsg ;
-   }
-}
-
-/******************************************/
-
-void CommMonoQ(Domain& domain)
-{
-   if (domain.numRanks() == 1)
-      return ;
-
-   int myRank ;
-   Index_t xferFields = 3 ; /* delv_xi, delv_eta, delv_zeta */
-   Domain_member fieldData[3] ;
-   Index_t fieldOffset[3] ;
-   Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
-   Index_t pmsg = 0 ; /* plane comm msg */
-   Index_t dx = domain.sizeX() ;
-   Index_t dy = domain.sizeY() ;
-   Index_t dz = domain.sizeZ() ;
-   MPI_Status status ;
-   Real_t *srcAddr ;
-   bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
-   /* assume communication to 6 neighbors by default */
-   rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
-   if (domain.rowLoc() == 0) {
-      rowMin = false ;
-   }
-   if (domain.rowLoc() == (domain.tp()-1)) {
-      rowMax = false ;
-   }
-   if (domain.colLoc() == 0) {
-      colMin = false ;
-   }
-   if (domain.colLoc() == (domain.tp()-1)) {
-      colMax = false ;
-   }
-   if (domain.planeLoc() == 0) {
-      planeMin = false ;
-   }
-   if (domain.planeLoc() == (domain.tp()-1)) {
-      planeMax = false ;
-   }
-
-   /* point into ghost data area */
-   // fieldData[0] = &(domain.delv_xi(domain.numElem())) ;
-   // fieldData[1] = &(domain.delv_eta(domain.numElem())) ;
-   // fieldData[2] = &(domain.delv_zeta(domain.numElem())) ;
-   fieldData[0] = &Domain::delv_xi ;
-   fieldData[1] = &Domain::delv_eta ;
-   fieldData[2] = &Domain::delv_zeta ;
-   fieldOffset[0] = domain.numElem() ;
-   fieldOffset[1] = domain.numElem() ;
-   fieldOffset[2] = domain.numElem() ;
-
-
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-
-   if (planeMin | planeMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dy ;
-
-      if (planeMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (planeMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-
-   if (rowMin | rowMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dx * dz ;
-
-      if (rowMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (rowMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-   if (colMin | colMax) {
-      /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
-      Index_t opCount = dy * dz ;
-
-      if (colMin) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-            fieldOffset[fi] += opCount ;
-         }
-         ++pmsg ;
-      }
-      if (colMax) {
-         /* contiguous memory */
-         srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
-         MPI_Wait(&domain.recvRequest[pmsg], &status) ;
-         for (Index_t fi=0 ; fi<xferFields; ++fi) {
-            Domain_member dest = fieldData[fi] ;
-            for (Index_t i=0; i<opCount; ++i) {
-               (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
-            }
-            srcAddr += opCount ;
-         }
-         ++pmsg ;
-      }
-   }
-}
-
-#endif
diff --git a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-init.cc b/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-init.cc
deleted file mode 100644
index cad79092f..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-init.cc
+++ /dev/null
@@ -1,734 +0,0 @@
-#include <math.h>
-#if USE_MPI
-# include <mpi.h>
-#endif
-#if USE_OMP
-#include <omp.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <cstdlib>
-#include "lulesh.h"
-
-/////////////////////////////////////////////////////////////////////
-Domain::Domain(Int_t numRanks, Index_t colLoc,
-               Index_t rowLoc, Index_t planeLoc,
-               Index_t nx, int tp, int nr, int balance, Int_t cost)
-   :
-   m_e_cut(Real_t(1.0e-7)),
-   m_p_cut(Real_t(1.0e-7)),
-   m_q_cut(Real_t(1.0e-7)),
-   m_v_cut(Real_t(1.0e-10)),
-   m_u_cut(Real_t(1.0e-7)),
-   m_hgcoef(Real_t(3.0)),
-   m_ss4o3(Real_t(4.0)/Real_t(3.0)),
-   m_qstop(Real_t(1.0e+12)),
-   m_monoq_max_slope(Real_t(1.0)),
-   m_monoq_limiter_mult(Real_t(2.0)),
-   m_qlc_monoq(Real_t(0.5)),
-   m_qqc_monoq(Real_t(2.0)/Real_t(3.0)),
-   m_qqc(Real_t(2.0)),
-   m_eosvmax(Real_t(1.0e+9)),
-   m_eosvmin(Real_t(1.0e-9)),
-   m_pmin(Real_t(0.)),
-   m_emin(Real_t(-1.0e+15)),
-   m_dvovmax(Real_t(0.1)),
-   m_refdens(Real_t(1.0)),
-//
-// set pointers to (potentially) "new'd" arrays to null to 
-// simplify deallocation.
-//
-   m_regNumList(0),
-   m_nodeElemStart(0),
-   m_nodeElemCornerList(0),
-   m_regElemSize(0),
-   m_regElemlist(0)
-#if USE_MPI
-   , 
-   commDataSend(0),
-   commDataRecv(0)
-#endif
-{
-
-   Index_t edgeElems = nx ;
-   Index_t edgeNodes = edgeElems+1 ;
-   this->cost() = cost;
-
-   m_tp       = tp ;
-   m_numRanks = numRanks ;
-
-   ///////////////////////////////
-   //   Initialize Sedov Mesh
-   ///////////////////////////////
-
-   // construct a uniform box for this processor
-
-   m_colLoc   =   colLoc ;
-   m_rowLoc   =   rowLoc ;
-   m_planeLoc = planeLoc ;
-   
-   m_sizeX = edgeElems ;
-   m_sizeY = edgeElems ;
-   m_sizeZ = edgeElems ;
-   m_numElem = edgeElems*edgeElems*edgeElems ;
-
-   m_numNode = edgeNodes*edgeNodes*edgeNodes ;
-
-   m_regNumList = new Index_t[numElem()] ;  // material indexset
-
-   // Elem-centered 
-   AllocateElemPersistent(numElem()) ;
-
-   // Node-centered 
-   AllocateNodePersistent(numNode()) ;
-
-   SetupCommBuffers(edgeNodes);
-
-   // Basic Field Initialization 
-   for (Index_t i=0; i<numElem(); ++i) {
-      e(i) =  Real_t(0.0) ;
-      p(i) =  Real_t(0.0) ;
-      q(i) =  Real_t(0.0) ;
-      ss(i) = Real_t(0.0) ;
-   }
-
-   // Note - v initializes to 1.0, not 0.0!
-   for (Index_t i=0; i<numElem(); ++i) {
-      v(i) = Real_t(1.0) ;
-   }
-
-   for (Index_t i=0; i<numNode(); ++i) {
-      xd(i) = Real_t(0.0) ;
-      yd(i) = Real_t(0.0) ;
-      zd(i) = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<numNode(); ++i) {
-      xdd(i) = Real_t(0.0) ;
-      ydd(i) = Real_t(0.0) ;
-      zdd(i) = Real_t(0.0) ;
-   }
-
-   for (Index_t i=0; i<numNode(); ++i) {
-      nodalMass(i) = Real_t(0.0) ;
-   }
-
-   BuildMesh(nx, edgeNodes, edgeElems);
-
-#if USE_OMP
-   SetupThreadSupportStructures();
-#endif
-
-   // Setup region index sets. For now, these are constant sized
-   // throughout the run, but could be changed every cycle to 
-   // simulate effects of ALE on the lagrange solver
-   CreateRegionIndexSets(nr, balance);
-
-   // Setup symmetry nodesets
-   SetupSymmetryPlanes(edgeNodes);
-
-   // Setup element connectivities
-   SetupElementConnectivities(edgeElems);
-
-   // Setup symmetry planes and free surface boundary arrays
-   SetupBoundaryConditions(edgeElems);
-
-
-   // Setup defaults
-
-   // These can be changed (requires recompile) if you want to run
-   // with a fixed timestep, or to a different end time, but it's
-   // probably easier/better to just run a fixed number of timesteps
-   // using the -i flag in 2.x
-
-   dtfixed() = Real_t(-1.0e-6) ; // Negative means use courant condition
-   stoptime()  = Real_t(1.0e-2); // *Real_t(edgeElems*tp/45.0) ;
-
-   // Initial conditions
-   deltatimemultlb() = Real_t(1.1) ;
-   deltatimemultub() = Real_t(1.2) ;
-   dtcourant() = Real_t(1.0e+20) ;
-   dthydro()   = Real_t(1.0e+20) ;
-   dtmax()     = Real_t(1.0e-2) ;
-   time()    = Real_t(0.) ;
-   cycle()   = Int_t(0) ;
-
-   // initialize field data 
-   for (Index_t i=0; i<numElem(); ++i) {
-      Real_t x_local[8], y_local[8], z_local[8] ;
-      Index_t *elemToNode = nodelist(i) ;
-      for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-      {
-        Index_t gnode = elemToNode[lnode];
-        x_local[lnode] = x(gnode);
-        y_local[lnode] = y(gnode);
-        z_local[lnode] = z(gnode);
-      }
-
-      // volume calculations
-      Real_t volume = CalcElemVolume(x_local, y_local, z_local );
-      volo(i) = volume ;
-      elemMass(i) = volume ;
-      for (Index_t j=0; j<8; ++j) {
-         Index_t idx = elemToNode[j] ;
-         nodalMass(idx) += volume / Real_t(8.0) ;
-      }
-   }
-
-   // deposit initial energy
-   // An energy of 3.948746e+7 is correct for a problem with
-   // 45 zones along a side - we need to scale it
-   const Real_t ebase = Real_t(3.948746e+7);
-   Real_t scale = (nx*m_tp)/Real_t(45.0);
-   Real_t einit = ebase*scale*scale*scale;
-   if (m_rowLoc + m_colLoc + m_planeLoc == 0) {
-      // Dump into the first zone (which we know is in the corner)
-      // of the domain that sits at the origin
-      e(0) = einit;
-   }
-   //set initial deltatime base on analytic CFL calculation
-   deltatime() = (Real_t(.5)*cbrt(volo(0)))/sqrt(Real_t(2.0)*einit);
-
-} // End constructor
-
-
-////////////////////////////////////////////////////////////////////////////////
-Domain::~Domain()
-{
-   delete [] m_regNumList;
-   delete [] m_nodeElemStart;
-   delete [] m_nodeElemCornerList;
-   delete [] m_regElemSize;
-   for (Index_t i=0 ; i<numReg() ; ++i) {
-     delete [] m_regElemlist[i];
-   }
-   delete [] m_regElemlist;
-   
-#if USE_MPI
-   delete [] commDataSend;
-   delete [] commDataRecv;
-#endif
-} // End destructor
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems)
-{
-  Index_t meshEdgeElems = m_tp*nx ;
-
-  // initialize nodal coordinates 
-  Index_t nidx = 0 ;
-  Real_t tz = Real_t(1.125)*Real_t(m_planeLoc*nx)/Real_t(meshEdgeElems) ;
-  for (Index_t plane=0; plane<edgeNodes; ++plane) {
-    Real_t ty = Real_t(1.125)*Real_t(m_rowLoc*nx)/Real_t(meshEdgeElems) ;
-    for (Index_t row=0; row<edgeNodes; ++row) {
-      Real_t tx = Real_t(1.125)*Real_t(m_colLoc*nx)/Real_t(meshEdgeElems) ;
-      for (Index_t col=0; col<edgeNodes; ++col) {
-	x(nidx) = tx ;
-	y(nidx) = ty ;
-	z(nidx) = tz ;
-	++nidx ;
-	// tx += ds ; // may accumulate roundoff... 
-	tx = Real_t(1.125)*Real_t(m_colLoc*nx+col+1)/Real_t(meshEdgeElems) ;
-      }
-      // ty += ds ;  // may accumulate roundoff... 
-      ty = Real_t(1.125)*Real_t(m_rowLoc*nx+row+1)/Real_t(meshEdgeElems) ;
-    }
-    // tz += ds ;  // may accumulate roundoff... 
-    tz = Real_t(1.125)*Real_t(m_planeLoc*nx+plane+1)/Real_t(meshEdgeElems) ;
-  }
-
-
-  // embed hexehedral elements in nodal point lattice 
-  Index_t zidx = 0 ;
-  nidx = 0 ;
-  for (Index_t plane=0; plane<edgeElems; ++plane) {
-    for (Index_t row=0; row<edgeElems; ++row) {
-      for (Index_t col=0; col<edgeElems; ++col) {
-	Index_t *localNode = nodelist(zidx) ;
-	localNode[0] = nidx                                       ;
-	localNode[1] = nidx                                   + 1 ;
-	localNode[2] = nidx                       + edgeNodes + 1 ;
-	localNode[3] = nidx                       + edgeNodes     ;
-	localNode[4] = nidx + edgeNodes*edgeNodes                 ;
-	localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
-	localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
-	localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
-	++zidx ;
-	++nidx ;
-      }
-      ++nidx ;
-    }
-    nidx += edgeNodes ;
-  }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::SetupThreadSupportStructures()
-{
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-
-  if (numthreads > 1) {
-    // set up node-centered indexing of elements 
-    Index_t *nodeElemCount = new Index_t[numNode()] ;
-
-    for (Index_t i=0; i<numNode(); ++i) {
-      nodeElemCount[i] = 0 ;
-    }
-
-    for (Index_t i=0; i<numElem(); ++i) {
-      Index_t *nl = nodelist(i) ;
-      for (Index_t j=0; j < 8; ++j) {
-	++(nodeElemCount[nl[j]] );
-      }
-    }
-
-    m_nodeElemStart = new Index_t[numNode()+1] ;
-
-    m_nodeElemStart[0] = 0;
-
-    for (Index_t i=1; i <= numNode(); ++i) {
-      m_nodeElemStart[i] =
-	m_nodeElemStart[i-1] + nodeElemCount[i-1] ;
-    }
-       
-    m_nodeElemCornerList = new Index_t[m_nodeElemStart[numNode()]];
-
-    for (Index_t i=0; i < numNode(); ++i) {
-      nodeElemCount[i] = 0;
-    }
-
-    for (Index_t i=0; i < numElem(); ++i) {
-      Index_t *nl = nodelist(i) ;
-      for (Index_t j=0; j < 8; ++j) {
-	Index_t m = nl[j];
-	Index_t k = i*8 + j ;
-	Index_t offset = m_nodeElemStart[m] + nodeElemCount[m] ;
-	m_nodeElemCornerList[offset] = k;
-	++(nodeElemCount[m]) ;
-      }
-    }
-
-    Index_t clSize = m_nodeElemStart[numNode()] ;
-    for (Index_t i=0; i < clSize; ++i) {
-      Index_t clv = m_nodeElemCornerList[i] ;
-      if ((clv < 0) || (clv > numElem()*8)) {
-	fprintf(stderr,
-		"AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n");
-#if USE_MPI
-	MPI_Abort(MPI_COMM_WORLD, -1);
-#else
-	exit(-1);
-#endif
-      }
-    }
-
-    delete [] nodeElemCount ;
-  }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::SetupCommBuffers(Int_t edgeNodes)
-{
-  // allocate a buffer large enough for nodal ghost data 
-  Index_t maxEdgeSize = MAX(this->sizeX(), MAX(this->sizeY(), this->sizeZ()))+1 ;
-  m_maxPlaneSize = CACHE_ALIGN_REAL(maxEdgeSize*maxEdgeSize) ;
-  m_maxEdgeSize = CACHE_ALIGN_REAL(maxEdgeSize) ;
-
-  // assume communication to 6 neighbors by default 
-  m_rowMin = (m_rowLoc == 0)        ? 0 : 1;
-  m_rowMax = (m_rowLoc == m_tp-1)     ? 0 : 1;
-  m_colMin = (m_colLoc == 0)        ? 0 : 1;
-  m_colMax = (m_colLoc == m_tp-1)     ? 0 : 1;
-  m_planeMin = (m_planeLoc == 0)    ? 0 : 1;
-  m_planeMax = (m_planeLoc == m_tp-1) ? 0 : 1;
-
-#if USE_MPI   
-  // account for face communication 
-  Index_t comBufSize =
-    (m_rowMin + m_rowMax + m_colMin + m_colMax + m_planeMin + m_planeMax) *
-    m_maxPlaneSize * MAX_FIELDS_PER_MPI_COMM ;
-
-  // account for edge communication 
-  comBufSize +=
-    ((m_rowMin & m_colMin) + (m_rowMin & m_planeMin) + (m_colMin & m_planeMin) +
-     (m_rowMax & m_colMax) + (m_rowMax & m_planeMax) + (m_colMax & m_planeMax) +
-     (m_rowMax & m_colMin) + (m_rowMin & m_planeMax) + (m_colMin & m_planeMax) +
-     (m_rowMin & m_colMax) + (m_rowMax & m_planeMin) + (m_colMax & m_planeMin)) *
-    m_maxEdgeSize * MAX_FIELDS_PER_MPI_COMM ;
-
-  // account for corner communication 
-  // factor of 16 is so each buffer has its own cache line 
-  comBufSize += ((m_rowMin & m_colMin & m_planeMin) +
-		 (m_rowMin & m_colMin & m_planeMax) +
-		 (m_rowMin & m_colMax & m_planeMin) +
-		 (m_rowMin & m_colMax & m_planeMax) +
-		 (m_rowMax & m_colMin & m_planeMin) +
-		 (m_rowMax & m_colMin & m_planeMax) +
-		 (m_rowMax & m_colMax & m_planeMin) +
-		 (m_rowMax & m_colMax & m_planeMax)) * CACHE_COHERENCE_PAD_REAL ;
-
-  this->commDataSend = new Real_t[comBufSize] ;
-  this->commDataRecv = new Real_t[comBufSize] ;
-  // prevent floating point exceptions 
-  memset(this->commDataSend, 0, comBufSize*sizeof(Real_t)) ;
-  memset(this->commDataRecv, 0, comBufSize*sizeof(Real_t)) ;
-#endif   
-
-  // Boundary nodesets
-  AllocateSymmetry(edgeNodes*edgeNodes) ;
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-void
-Domain::CreateRegionIndexSets(Int_t nr, Int_t balance)
-{
-#if USE_MPI   
-   Index_t myRank;
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-   srand(myRank);
-#else
-   srand(0);
-   Index_t myRank = 0;
-#endif
-   this->numReg() = nr;
-   m_regElemSize = new Index_t[numReg()];
-   m_regElemlist = new Index_t*[numReg()];
-   Index_t nextIndex = 0;
-   //if we only have one region just fill it
-   // Fill out the regNumList with material numbers, which are always
-   // the region index plus one 
-   if(numReg() == 1) {
-      while (nextIndex < numElem()) {
-	 this->regNumList(nextIndex) = 1;
-         nextIndex++;
-      }
-      regElemSize(0) = 0;
-   }
-   //If we have more than one region distribute the elements.
-   else {
-      Int_t regionNum;
-      Int_t regionVar;
-      Int_t lastReg = -1;
-      Int_t binSize;
-      Index_t elements;
-      Index_t runto = 0;
-      Int_t costDenominator = 0;
-      Int_t* regBinEnd = new Int_t[numReg()];
-      //Determine the relative weights of all the regions.  This is based off the -b flag.  Balance is the value passed into b.  
-      for (Index_t i=0 ; i<numReg() ; ++i) {
-         regElemSize(i) = 0;
-	 costDenominator += pow((i+1), balance);  //Total sum of all regions weights
-	 regBinEnd[i] = costDenominator;  //Chance of hitting a given region is (regBinEnd[i] - regBinEdn[i-1])/costDenominator
-      }
-      //Until all elements are assigned
-      while (nextIndex < numElem()) {
-	 //pick the region
-	 regionVar = rand() % costDenominator;
-	 Index_t i = 0;
-         while(regionVar >= regBinEnd[i])
-	    i++;
-         //rotate the regions based on MPI rank.  Rotation is Rank % NumRegions this makes each domain have a different region with 
-         //the highest representation
-	 regionNum = ((i + myRank) % numReg()) + 1;
-	 // make sure we don't pick the same region twice in a row
-         while(regionNum == lastReg) {
-	    regionVar = rand() % costDenominator;
-	    i = 0;
-            while(regionVar >= regBinEnd[i])
-	       i++;
-	    regionNum = ((i + myRank) % numReg()) + 1;
-         }
-	 //Pick the bin size of the region and determine the number of elements.
-         binSize = rand() % 1000;
-	 if(binSize < 773) {
-	   elements = rand() % 15 + 1;
-	 }
-	 else if(binSize < 937) {
-	   elements = rand() % 16 + 16;
-	 }
-	 else if(binSize < 970) {
-	   elements = rand() % 32 + 32;
-	 }
-	 else if(binSize < 974) {
-	   elements = rand() % 64 + 64;
-	 } 
-	 else if(binSize < 978) {
-	   elements = rand() % 128 + 128;
-	 }
-	 else if(binSize < 981) {
-	   elements = rand() % 256 + 256;
-	 }
-	 else
-	    elements = rand() % 1537 + 512;
-	 runto = elements + nextIndex;
-	 //Store the elements.  If we hit the end before we run out of elements then just stop.
-         while (nextIndex < runto && nextIndex < numElem()) {
-	    this->regNumList(nextIndex) = regionNum;
-	    nextIndex++;
-	 }
-	 lastReg = regionNum;
-      }
-
-      delete [] regBinEnd; 
-   }
-   // Convert regNumList to region index sets
-   // First, count size of each region 
-   for (Index_t i=0 ; i<numElem() ; ++i) {
-      int r = this->regNumList(i)-1; // region index == regnum-1
-      regElemSize(r)++;
-   }
-   // Second, allocate each region index set
-   for (Index_t i=0 ; i<numReg() ; ++i) {
-      m_regElemlist[i] = new Index_t[regElemSize(i)];
-      regElemSize(i) = 0;
-   }
-   // Third, fill index sets
-   for (Index_t i=0 ; i<numElem() ; ++i) {
-      Index_t r = regNumList(i)-1;       // region index == regnum-1
-      Index_t regndx = regElemSize(r)++; // Note increment
-      regElemlist(r,regndx) = i;
-   }
-   
-}
-
-/////////////////////////////////////////////////////////////
-void 
-Domain::SetupSymmetryPlanes(Int_t edgeNodes)
-{
-  Index_t nidx = 0 ;
-  for (Index_t i=0; i<edgeNodes; ++i) {
-    Index_t planeInc = i*edgeNodes*edgeNodes ;
-    Index_t rowInc   = i*edgeNodes ;
-    for (Index_t j=0; j<edgeNodes; ++j) {
-      if (m_planeLoc == 0) {
-	m_symmZ[nidx] = rowInc   + j ;
-      }
-      if (m_rowLoc == 0) {
-	m_symmY[nidx] = planeInc + j ;
-      }
-      if (m_colLoc == 0) {
-	m_symmX[nidx] = planeInc + j*edgeNodes ;
-      }
-      ++nidx ;
-    }
-  }
-}
-
-
-
-/////////////////////////////////////////////////////////////
-void
-Domain::SetupElementConnectivities(Int_t edgeElems)
-{
-   lxim(0) = 0 ;
-   for (Index_t i=1; i<numElem(); ++i) {
-      lxim(i)   = i-1 ;
-      lxip(i-1) = i ;
-   }
-   lxip(numElem()-1) = numElem()-1 ;
-
-   for (Index_t i=0; i<edgeElems; ++i) {
-      letam(i) = i ; 
-      letap(numElem()-edgeElems+i) = numElem()-edgeElems+i ;
-   }
-   for (Index_t i=edgeElems; i<numElem(); ++i) {
-      letam(i) = i-edgeElems ;
-      letap(i-edgeElems) = i ;
-   }
-
-   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
-      lzetam(i) = i ;
-      lzetap(numElem()-edgeElems*edgeElems+i) = numElem()-edgeElems*edgeElems+i ;
-   }
-   for (Index_t i=edgeElems*edgeElems; i<numElem(); ++i) {
-      lzetam(i) = i - edgeElems*edgeElems ;
-      lzetap(i-edgeElems*edgeElems) = i ;
-   }
-}
-
-/////////////////////////////////////////////////////////////
-void
-Domain::SetupBoundaryConditions(Int_t edgeElems) 
-{
-  Index_t ghostIdx[6] ;  // offsets to ghost locations
-
-  // set up boundary condition information
-  for (Index_t i=0; i<numElem(); ++i) {
-     elemBC(i) = Int_t(0) ;
-  }
-
-  for (Index_t i=0; i<6; ++i) {
-    ghostIdx[i] = INT_MIN ;
-  }
-
-  Int_t pidx = numElem() ;
-  if (m_planeMin != 0) {
-    ghostIdx[0] = pidx ;
-    pidx += sizeX()*sizeY() ;
-  }
-
-  if (m_planeMax != 0) {
-    ghostIdx[1] = pidx ;
-    pidx += sizeX()*sizeY() ;
-  }
-
-  if (m_rowMin != 0) {
-    ghostIdx[2] = pidx ;
-    pidx += sizeX()*sizeZ() ;
-  }
-
-  if (m_rowMax != 0) {
-    ghostIdx[3] = pidx ;
-    pidx += sizeX()*sizeZ() ;
-  }
-
-  if (m_colMin != 0) {
-    ghostIdx[4] = pidx ;
-    pidx += sizeY()*sizeZ() ;
-  }
-
-  if (m_colMax != 0) {
-    ghostIdx[5] = pidx ;
-  }
-
-  // symmetry plane or free surface BCs 
-  for (Index_t i=0; i<edgeElems; ++i) {
-    Index_t planeInc = i*edgeElems*edgeElems ;
-    Index_t rowInc   = i*edgeElems ;
-    for (Index_t j=0; j<edgeElems; ++j) {
-      if (m_planeLoc == 0) {
-	elemBC(rowInc+j) |= ZETA_M_SYMM ;
-      }
-      else {
-	elemBC(rowInc+j) |= ZETA_M_COMM ;
-	lzetam(rowInc+j) = ghostIdx[0] + rowInc + j ;
-      }
-
-      if (m_planeLoc == m_tp-1) {
-	elemBC(rowInc+j+numElem()-edgeElems*edgeElems) |=
-	  ZETA_P_FREE;
-      }
-      else {
-	elemBC(rowInc+j+numElem()-edgeElems*edgeElems) |=
-	  ZETA_P_COMM ;
-	lzetap(rowInc+j+numElem()-edgeElems*edgeElems) =
-	  ghostIdx[1] + rowInc + j ;
-      }
-
-      if (m_rowLoc == 0) {
-	elemBC(planeInc+j) |= ETA_M_SYMM ;
-      }
-      else {
-	elemBC(planeInc+j) |= ETA_M_COMM ;
-	letam(planeInc+j) = ghostIdx[2] + rowInc + j ;
-      }
-
-      if (m_rowLoc == m_tp-1) {
-	elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= 
-	  ETA_P_FREE ;
-      }
-      else {
-	elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= 
-	  ETA_P_COMM ;
-	letap(planeInc+j+edgeElems*edgeElems-edgeElems) =
-	  ghostIdx[3] +  rowInc + j ;
-      }
-
-      if (m_colLoc == 0) {
-	elemBC(planeInc+j*edgeElems) |= XI_M_SYMM ;
-      }
-      else {
-	elemBC(planeInc+j*edgeElems) |= XI_M_COMM ;
-	lxim(planeInc+j*edgeElems) = ghostIdx[4] + rowInc + j ;
-      }
-
-      if (m_colLoc == m_tp-1) {
-	elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_FREE ;
-      }
-      else {
-	elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_COMM ;
-	lxip(planeInc+j*edgeElems+edgeElems-1) =
-	  ghostIdx[5] + rowInc + j ;
-      }
-    }
-  }
-}
-
-///////////////////////////////////////////////////////////////////////////
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side)
-{
-   Int_t testProcs;
-   Int_t dx, dy, dz;
-   Int_t myDom;
-   
-   // Assume cube processor layout for now 
-   testProcs = Int_t(cbrt(Real_t(numRanks))+0.5) ;
-   if (testProcs*testProcs*testProcs != numRanks) {
-      printf("Num processors must be a cube of an integer (1, 8, 27, ...)\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   if (sizeof(Real_t) != 4 && sizeof(Real_t) != 8) {
-      printf("MPI operations only support float and double right now...\n");
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   if (MAX_FIELDS_PER_MPI_COMM > CACHE_COHERENCE_PAD_REAL) {
-      printf("corner element comm buffers too small.  Fix code.\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-
-   dx = testProcs ;
-   dy = testProcs ;
-   dz = testProcs ;
-
-   // temporary test
-   if (dx*dy*dz != numRanks) {
-      printf("error -- must have as many domains as procs\n") ;
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1) ;
-#else
-      exit(-1);
-#endif
-   }
-   Int_t remainder = dx*dy*dz % numRanks ;
-   if (myRank < remainder) {
-      myDom = myRank*( 1+ (dx*dy*dz / numRanks)) ;
-   }
-   else {
-      myDom = remainder*( 1+ (dx*dy*dz / numRanks)) +
-         (myRank - remainder)*(dx*dy*dz/numRanks) ;
-   }
-
-   *col = myDom % dx ;
-   *row = (myDom / dx) % dy ;
-   *plane = myDom / (dx*dy) ;
-   *side = testProcs;
-
-   return;
-}
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-util.cc b/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-util.cc
deleted file mode 100644
index bdade86d9..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-util.cc
+++ /dev/null
@@ -1,224 +0,0 @@
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <stdio.h>
-#if USE_MPI
-#include <mpi.h>
-#endif
-#include "lulesh.h"
-
-/* Helper function for converting strings to ints, with error checking */
-int StrToInt(const char *token, int *retVal)
-{
-   const char *c ;
-   char *endptr ;
-   const int decimal_base = 10 ;
-
-   if (token == NULL)
-      return 0 ;
-   
-   c = token ;
-   *retVal = (int)strtol(c, &endptr, decimal_base) ;
-   if((endptr != c) && ((*endptr == ' ') || (*endptr == '\0')))
-      return 1 ;
-   else
-      return 0 ;
-}
-
-static void PrintCommandLineOptions(char *execname, int myRank)
-{
-   if (myRank == 0) {
-
-      printf("Usage: %s [opts]\n", execname);
-      printf(" where [opts] is one or more of:\n");
-      printf(" -q              : quiet mode - suppress all stdout\n");
-      printf(" -i <iterations> : number of cycles to run\n");
-      printf(" -s <size>       : length of cube mesh along side\n");
-      printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
-      printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
-      printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
-      printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
-      printf(" -p              : Print out progress\n");
-      printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
-      printf(" -h              : This message\n");
-      printf("\n\n");
-   }
-}
-
-static void ParseError(const char *message, int myRank)
-{
-   if (myRank == 0) {
-      printf("%s\n", message);
-#if USE_MPI      
-      MPI_Abort(MPI_COMM_WORLD, -1);
-#else
-      exit(-1);
-#endif
-   }
-}
-
-void ParseCommandLineOptions(int argc, char *argv[],
-                             int myRank, struct cmdLineOpts *opts)
-{
-   if(argc > 1) {
-      int i = 1;
-
-      while(i < argc) {
-         int ok;
-         /* -i <iterations> */
-         if(strcmp(argv[i], "-i") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -i", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->its));
-            if(!ok) {
-               ParseError("Parse Error on option -i integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -s <size, sidelength> */
-         else if(strcmp(argv[i], "-s") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -s\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->nx));
-            if(!ok) {
-               ParseError("Parse Error on option -s integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-	 /* -r <numregions> */
-         else if (strcmp(argv[i], "-r") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -r\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->numReg));
-            if (!ok) {
-               ParseError("Parse Error on option -r integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-	 /* -f <numfilepieces> */
-         else if (strcmp(argv[i], "-f") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -f\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->numFiles));
-            if (!ok) {
-               ParseError("Parse Error on option -f integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -p */
-         else if (strcmp(argv[i], "-p") == 0) {
-            opts->showProg = 1;
-            i++;
-         }
-         /* -q */
-         else if (strcmp(argv[i], "-q") == 0) {
-            opts->quiet = 1;
-            i++;
-         }
-         else if (strcmp(argv[i], "-b") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -b\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->balance));
-            if (!ok) {
-               ParseError("Parse Error on option -b integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         else if (strcmp(argv[i], "-c") == 0) {
-            if (i+1 >= argc) {
-               ParseError("Missing integer argument to -c\n", myRank);
-            }
-            ok = StrToInt(argv[i+1], &(opts->cost));
-            if (!ok) {
-               ParseError("Parse Error on option -c integer value required after argument\n", myRank);
-            }
-            i+=2;
-         }
-         /* -v */
-         else if (strcmp(argv[i], "-v") == 0) {
-#if VIZ_MESH            
-            opts->viz = 1;
-#else
-            ParseError("Use of -v requires compiling with -DVIZ_MESH\n", myRank);
-#endif
-            i++;
-         }
-         /* -h */
-         else if (strcmp(argv[i], "-h") == 0) {
-            PrintCommandLineOptions(argv[0], myRank);
-#if USE_MPI            
-            MPI_Abort(MPI_COMM_WORLD, 0);
-#else
-            exit(0);
-#endif
-         }
-         else {
-            char msg[80];
-            PrintCommandLineOptions(argv[0], myRank);
-            sprintf(msg, "ERROR: Unknown command line argument: %s\n", argv[i]);
-            ParseError(msg, myRank);
-         }
-      }
-   }
-}
-
-/////////////////////////////////////////////////////////////////////
-
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks)
-{
-   // GrindTime1 only takes a single domain into account, and is thus a good way to measure
-   // processor speed indepdendent of MPI parallelism.
-   // GrindTime2 takes into account speedups from MPI parallelism 
-   Real_t grindTime1 = ((elapsed_time*1e6)/locDom.cycle())/(nx*nx*nx);
-   Real_t grindTime2 = ((elapsed_time*1e6)/locDom.cycle())/(nx*nx*nx*numRanks);
-
-   Index_t ElemId = 0;
-   printf("Run completed:  \n");
-   printf("   Problem size        =  %i \n",    nx);
-   printf("   MPI tasks           =  %i \n",    numRanks);
-   printf("   Iteration count     =  %i \n",    locDom.cycle());
-   printf("   Final Origin Energy = %12.6e \n", locDom.e(ElemId));
-
-   Real_t   MaxAbsDiff = Real_t(0.0);
-   Real_t TotalAbsDiff = Real_t(0.0);
-   Real_t   MaxRelDiff = Real_t(0.0);
-
-   for (Index_t j=0; j<nx; ++j) {
-      for (Index_t k=j+1; k<nx; ++k) {
-         Real_t AbsDiff = FABS(locDom.e(j*nx+k)-locDom.e(k*nx+j));
-         TotalAbsDiff  += AbsDiff;
-
-         if (MaxAbsDiff <AbsDiff) {
-            MaxAbsDiff = AbsDiff;
-         }
-
-         if (locDom.e(k*nx+j) != 0.0) {
-            Real_t RelDiff = AbsDiff / locDom.e(k*nx+j);
-            if (MaxRelDiff <RelDiff) {
-               MaxRelDiff = RelDiff;
-            }
-         }
-      }
-   }
-
-   // Quick symmetry check
-   printf("   Testing Plane 0 of Energy Array on rank 0:\n");
-   printf("        MaxAbsDiff   = %12.6e\n",   MaxAbsDiff   );
-   printf("        TotalAbsDiff = %12.6e\n",   TotalAbsDiff );
-   printf("        MaxRelDiff   = %12.6e\n\n", MaxRelDiff   );
-
-   // Timing information
-   printf("\nElapsed time         = %10.2f (s)\n", elapsed_time);
-   printf("Grind time (us/z/c)  = %10.8g (per dom)  (%10.8g overall)\n", grindTime1, grindTime2);
-   printf("FOM                  = %10.8g (z/s)\n\n", 1000.0/grindTime2); // zones per second
-
-   return ;
-}
diff --git a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-viz.cc b/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-viz.cc
deleted file mode 100644
index f0d1f36e4..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh-viz.cc
+++ /dev/null
@@ -1,432 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include "lulesh.h"
-
-#if defined(VIZ_MESH)
-
-#ifdef __cplusplus
-  extern "C" {
-#endif
-#include "silo.h"
-#if USE_MPI
-# include "pmpio.h"
-#endif
-#ifdef __cplusplus
-  }
-#endif
-
-// Function prototypes
-static void 
-DumpDomainToVisit(DBfile *db, Domain& domain, int myRank);
-static
-
-
-#if USE_MPI
-// For some reason, earlier versions of g++ (e.g. 4.2) won't let me
-// put the 'static' qualifier on this prototype, even if it's done
-// consistently in the prototype and definition
-void
-DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 
-                      char basename[], int numRanks);
-
-// Callback prototypes for PMPIO interface (only useful if we're
-// running parallel)
-static void *
-LULESH_PMPIO_Create(const char *fname,
-		     const char *dname,
-		     void *udata);
-static void *
-LULESH_PMPIO_Open(const char *fname,
-		   const char *dname,
-		   PMPIO_iomode_t ioMode,
-		   void *udata);
-static void
-LULESH_PMPIO_Close(void *file, void *udata);
-
-#else
-void
-DumpMultiblockObjects(DBfile *db, char basename[], int numRanks);
-#endif
-
-
-/**********************************************************************/
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks) 
-{
-  char subdirName[32];
-  char basename[32];
-  DBfile *db;
-
-
-  sprintf(basename, "lulesh_plot_c%d", domain.cycle());
-  sprintf(subdirName, "data_%d", myRank);
-
-#if USE_MPI
-
-  PMPIO_baton_t *bat = PMPIO_Init(numFiles,
-				  PMPIO_WRITE,
-				  MPI_COMM_WORLD,
-				  10101,
-				  LULESH_PMPIO_Create,
-				  LULESH_PMPIO_Open,
-				  LULESH_PMPIO_Close,
-				  NULL);
-
-  int myiorank = PMPIO_GroupRank(bat, myRank);
-
-  char fileName[64];
-  
-  if (myiorank == 0) 
-    strcpy(fileName, basename);
-  else
-    sprintf(fileName, "%s.%03d", basename, myiorank);
-
-  db = (DBfile*)PMPIO_WaitForBaton(bat, fileName, subdirName);
-
-  DumpDomainToVisit(db, domain, myRank);
-
-  // Processor 0 writes out bit of extra data to its file that
-  // describes how to stitch all the pieces together
-  if (myRank == 0) {
-    DumpMultiblockObjects(db, bat, basename, numRanks);
-  }
-
-  PMPIO_HandOffBaton(bat, db);
-
-  PMPIO_Finish(bat);
-#else
-
-  db = (DBfile*)DBCreate(basename, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X);
-
-  if (db) {
-     DBMkDir(db, subdirName);
-     DBSetDir(db, subdirName);
-     DumpDomainToVisit(db, domain, myRank);
-     DumpMultiblockObjects(db, basename, numRanks);
-  }
-  else {
-     printf("Error writing out viz file - rank %d\n", myRank);
-  }
-
-#endif
-}
-
-
-
-/**********************************************************************/
-
-static void 
-DumpDomainToVisit(DBfile *db, Domain& domain, int myRank)
-{
-   int ok = 0;
-   
-   /* Create an option list that will give some hints to VisIt for
-    * printing out the cycle and time in the annotations */
-   DBoptlist *optlist;
-
-
-   /* Write out the mesh connectivity in fully unstructured format */
-   int shapetype[1] = {DB_ZONETYPE_HEX};
-   int shapesize[1] = {8};
-   int shapecnt[1] = {domain.numElem()};
-   int *conn = new int[domain.numElem()*8] ;
-   int ci = 0 ;
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      Index_t *elemToNode = domain.nodelist(ei) ;
-      for (int ni=0; ni < 8; ++ni) {
-         conn[ci++] = elemToNode[ni] ;
-      }
-   }
-   ok += DBPutZonelist2(db, "connectivity", domain.numElem(), 3,
-                        conn, domain.numElem()*8,
-                        0,0,0, /* Not carrying ghost zones */
-                        shapetype, shapesize, shapecnt,
-                        1, NULL);
-   delete [] conn ;
-
-   /* Write out the mesh coordinates associated with the mesh */
-   const char* coordnames[3] = {"X", "Y", "Z"};
-   float *coords[3] ;
-   coords[0] = new float[domain.numNode()] ;
-   coords[1] = new float[domain.numNode()] ;
-   coords[2] = new float[domain.numNode()] ;
-   for (int ni=0; ni < domain.numNode() ; ++ni) {
-      coords[0][ni] = float(domain.x(ni)) ;
-      coords[1][ni] = float(domain.y(ni)) ;
-      coords[2][ni] = float(domain.z(ni)) ;
-   }
-   optlist = DBMakeOptlist(2);
-   ok += DBAddOption(optlist, DBOPT_DTIME, &domain.time());
-   ok += DBAddOption(optlist, DBOPT_CYCLE, &domain.cycle());
-   ok += DBPutUcdmesh(db, "mesh", 3, (char**)&coordnames[0], (float**)coords,
-                      domain.numNode(), domain.numElem(), "connectivity",
-                      0, DB_FLOAT, optlist);
-   ok += DBFreeOptlist(optlist);
-   delete [] coords[2] ;
-   delete [] coords[1] ;
-   delete [] coords[0] ;
-
-   /* Write out the materials */
-   int *matnums = new int[domain.numReg()];
-   int dims[1] = {domain.numElem()}; // No mixed elements
-   for(int i=0 ; i<domain.numReg() ; ++i)
-      matnums[i] = i+1;
-   
-   ok += DBPutMaterial(db, "regions", "mesh", domain.numReg(),
-                       matnums, domain.regNumList(), dims, 1,
-                       NULL, NULL, NULL, NULL, 0, DB_FLOAT, NULL);
-   delete [] matnums;
-
-   /* Write out pressure, energy, relvol, q */
-
-   float *e = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      e[ei] = float(domain.e(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "e", "mesh", e,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] e ;
-
-
-   float *p = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      p[ei] = float(domain.p(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "p", "mesh", p,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] p ;
-
-   float *v = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      v[ei] = float(domain.v(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "v", "mesh", v,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] v ;
-
-   float *q = new float[domain.numElem()] ; 
-   for (int ei=0; ei < domain.numElem(); ++ei) {
-      q[ei] = float(domain.q(ei)) ;
-   }
-   ok += DBPutUcdvar1(db, "q", "mesh", q,
-                      domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
-                      NULL);
-   delete [] q ;
-
-   /* Write out nodal speed, velocities */
-   float *zd    = new float[domain.numNode()];
-   float *yd    = new float[domain.numNode()];
-   float *xd    = new float[domain.numNode()];
-   float *speed = new float[domain.numNode()];
-   for(int ni=0 ; ni < domain.numNode() ; ++ni) {
-      xd[ni]    = float(domain.xd(ni));
-      yd[ni]    = float(domain.yd(ni));
-      zd[ni]    = float(domain.zd(ni));
-      speed[ni] = float(sqrt((xd[ni]*xd[ni])+(yd[ni]*yd[ni])+(zd[ni]*zd[ni])));
-   }
-
-   ok += DBPutUcdvar1(db, "speed", "mesh", speed,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] speed;
-
-
-   ok += DBPutUcdvar1(db, "xd", "mesh", xd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] xd ;
-
-   ok += DBPutUcdvar1(db, "yd", "mesh", yd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] yd ;
-
-   ok += DBPutUcdvar1(db, "zd", "mesh", zd,
-                      domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
-                      NULL);
-   delete [] zd ;
-
-
-   if (ok != 0) {
-      printf("Error writing out viz file - rank %d\n", myRank);
-   }
-}
-
-/**********************************************************************/
-
-#if USE_MPI     
-void
-   DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 
-                         char basename[], int numRanks)
-#else
-void
-  DumpMultiblockObjects(DBfile *db, char basename[], int numRanks)
-#endif
-{
-   /* MULTIBLOCK objects to tie together multiple files */
-  char **multimeshObjs;
-  char **multimatObjs;
-  char ***multivarObjs;
-  int *blockTypes;
-  int *varTypes;
-  int ok = 0;
-  // Make sure this list matches what's written out above
-  char vars[][10] = {"p","e","v","q", "speed", "xd", "yd", "zd"};
-  int numvars = sizeof(vars)/sizeof(vars[0]);
-
-  // Reset to the root directory of the silo file
-  DBSetDir(db, "/");
-
-  // Allocate a bunch of space for building up the string names
-  multimeshObjs = new char*[numRanks];
-  multimatObjs = new char*[numRanks];
-  multivarObjs = new char**[numvars];
-  blockTypes = new int[numRanks];
-  varTypes = new int[numRanks];
-
-  for(int v=0 ; v<numvars ; ++v) {
-     multivarObjs[v] = new char*[numRanks];
-  }
-  
-  for(int i=0 ; i<numRanks ; ++i) {
-     multimeshObjs[i] = new char[64];
-     multimatObjs[i] = new char[64];
-     for(int v=0 ; v<numvars ; ++v) {
-        multivarObjs[v][i] = new char[64];
-     }
-     blockTypes[i] = DB_UCDMESH;
-     varTypes[i] = DB_UCDVAR;
-  }
-      
-  // Build up the multiobject names
-  for(int i=0 ; i<numRanks ; ++i) {
-#if USE_MPI     
-    int iorank = PMPIO_GroupRank(bat, i);
-#else
-    int iorank = 0;
-#endif
-
-    //delete multivarObjs[i];
-    if (iorank == 0) {
-      snprintf(multimeshObjs[i], 64, "/data_%d/mesh", i);
-      snprintf(multimatObjs[i], 64, "/data_%d/regions",i);
-      for(int v=0 ; v<numvars ; ++v) {
-	snprintf(multivarObjs[v][i], 64, "/data_%d/%s", i, vars[v]);
-      }
-     
-    }
-    else {
-      snprintf(multimeshObjs[i], 64, "%s.%03d:/data_%d/mesh",
-               basename, iorank, i);
-      snprintf(multimatObjs[i], 64, "%s.%03d:/data_%d/regions", 
-	       basename, iorank, i);
-      for(int v=0 ; v<numvars ; ++v) {
-         snprintf(multivarObjs[v][i], 64, "%s.%03d:/data_%d/%s", 
-                  basename, iorank, i, vars[v]);
-      }
-    }
-  }
-
-  // Now write out the objects
-  ok += DBPutMultimesh(db, "mesh", numRanks,
-		       (char**)multimeshObjs, blockTypes, NULL);
-  ok += DBPutMultimat(db, "regions", numRanks,
-		      (char**)multimatObjs, NULL);
-  for(int v=0 ; v<numvars ; ++v) {
-     ok += DBPutMultivar(db, vars[v], numRanks,
-                         (char**)multivarObjs[v], varTypes, NULL);
-  }
-
-  for(int v=0; v < numvars; ++v) {
-    for(int i = 0; i < numRanks; i++) {
-      delete multivarObjs[v][i];
-    }
-    delete multivarObjs[v];
-  }
-
-  // Clean up
-  for(int i=0 ; i<numRanks ; i++) {
-    delete multimeshObjs[i];
-    delete multimatObjs[i];
-  }
-  delete [] multimeshObjs;
-  delete [] multimatObjs;
-  delete [] multivarObjs;
-  delete [] blockTypes;
-  delete [] varTypes;
-
-  if (ok != 0) {
-    printf("Error writing out multiXXX objs to viz file - rank 0\n");
-  }
-}
-
-# if USE_MPI
-
-/**********************************************************************/
-
-static void *
-LULESH_PMPIO_Create(const char *fname,
-		     const char *dname,
-		     void *udata)
-{
-   /* Create the file */
-   DBfile* db = DBCreate(fname, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X);
-
-   /* Put the data in a subdirectory, so VisIt only sees the multimesh
-    * objects we write out in the base file */
-   if (db) {
-     DBMkDir(db, dname);
-     DBSetDir(db, dname);
-   }
-   return (void*)db;
-}
-
-   
-/**********************************************************************/
-
-static void *
-LULESH_PMPIO_Open(const char *fname,
-		   const char *dname,
-		   PMPIO_iomode_t ioMode,
-		   void *udata)
-{
-   /* Open the file */
-  DBfile* db = DBOpen(fname, DB_UNKNOWN, DB_APPEND);
-
-   /* Put the data in a subdirectory, so VisIt only sees the multimesh
-    * objects we write out in the base file */
-   if (db) {
-     DBMkDir(db, dname);
-     DBSetDir(db, dname);
-   }
-   return (void*)db;
-}
-
-   
-/**********************************************************************/
-
-static void
-LULESH_PMPIO_Close(void *file, void *udata)
-{
-  DBfile *db = (DBfile*)file;
-  if (db)
-    DBClose(db);
-}
-# endif
-
-   
-#else
-
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks)
-{
-   if (myRank == 0) {
-      printf("Must enable -DVIZ_MESH at compile time to call DumpDomain\n");
-   }
-}
-
-#endif
-
diff --git a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh.cc b/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh.cc
deleted file mode 100644
index 76c4a7898..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh.cc
+++ /dev/null
@@ -1,2819 +0,0 @@
-/*
-  This is a Version 2.0 MPI + OpenMP implementation of LULESH
-
-                 Copyright (c) 2010-2013.
-      Lawrence Livermore National Security, LLC.
-Produced at the Lawrence Livermore National Laboratory.
-                  LLNL-CODE-461231
-                All rights reserved.
-
-This file is part of LULESH, Version 2.0.
-Please also read this link -- http://www.opensource.org/licenses/index.php
-
-//////////////
-DIFFERENCES BETWEEN THIS VERSION (2.x) AND EARLIER VERSIONS:
-* Addition of regions to make work more representative of multi-material codes
-* Default size of each domain is 30^3 (27000 elem) instead of 45^3. This is
-  more representative of our actual working set sizes
-* Single source distribution supports pure serial, pure OpenMP, MPI-only, 
-  and MPI+OpenMP
-* Addition of ability to visualize the mesh using VisIt 
-  https://wci.llnl.gov/codes/visit/download.html
-* Various command line options (see ./lulesh2.0 -h)
- -q              : quiet mode - suppress stdout
- -i <iterations> : number of cycles to run
- -s <size>       : length of cube mesh along side
- -r <numregions> : Number of distinct regions (def: 11)
- -b <balance>    : Load balance between regions of a domain (def: 1)
- -c <cost>       : Extra cost of more expensive regions (def: 1)
- -f <filepieces> : Number of file parts for viz output (def: np/9)
- -p              : Print out progress
- -v              : Output viz file (requires compiling with -DVIZ_MESH
- -h              : This message
-
- printf("Usage: %s [opts]\n", execname);
-      printf(" where [opts] is one or more of:\n");
-      printf(" -q              : quiet mode - suppress all stdout\n");
-      printf(" -i <iterations> : number of cycles to run\n");
-      printf(" -s <size>       : length of cube mesh along side\n");
-      printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
-      printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
-      printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
-      printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
-      printf(" -p              : Print out progress\n");
-      printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
-      printf(" -h              : This message\n");
-      printf("\n\n");
-
-*Notable changes in LULESH 2.0
-
-* Split functionality into different files
-lulesh.cc - where most (all?) of the timed functionality lies
-lulesh-comm.cc - MPI functionality
-lulesh-init.cc - Setup code
-lulesh-viz.cc  - Support for visualization option
-lulesh-util.cc - Non-timed functions
-*
-* The concept of "regions" was added, although every region is the same ideal
-*    gas material, and the same sedov blast wave problem is still the only
-*    problem its hardcoded to solve.
-* Regions allow two things important to making this proxy app more representative:
-*   Four of the LULESH routines are now performed on a region-by-region basis,
-*     making the memory access patterns non-unit stride
-*   Artificial load imbalances can be easily introduced that could impact
-*     parallelization strategies.  
-* The load balance flag changes region assignment.  Region number is raised to
-*   the power entered for assignment probability.  Most likely regions changes
-*   with MPI process id.
-* The cost flag raises the cost of ~45% of the regions to evaluate EOS by the
-*   entered multiple. The cost of 5% is 10x the entered multiple.
-* MPI and OpenMP were added, and coalesced into a single version of the source
-*   that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
-* Added support to write plot files using "poor mans parallel I/O" when linked
-*   with the silo library, which in turn can be read by VisIt.
-* Enabled variable timestep calculation by default (courant condition), which
-*   results in an additional reduction.
-* Default domain (mesh) size reduced from 45^3 to 30^3
-* Command line options to allow numerous test cases without needing to recompile
-* Performance optimizations and code cleanup beyond LULESH 1.0
-* Added a "Figure of Merit" calculation (elements solved per microsecond) and
-*   output in support of using LULESH 2.0 for the 2017 CORAL procurement
-*
-* Possible Differences in Final Release (other changes possible)
-*
-* High Level mesh structure to allow data structure transformations
-* Different default parameters
-* Minor code performance changes and cleanup
-
-TODO in future versions
-* Add reader for (truly) unstructured meshes, probably serial only
-* CMake based build system
-
-//////////////
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the disclaimer below.
-
-   * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the disclaimer (as noted below)
-     in the documentation and/or other materials provided with the
-     distribution.
-
-   * Neither the name of the LLNS/LLNL nor the names of its contributors
-     may be used to endorse or promote products derived from this software
-     without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,
-THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
-EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Additional BSD Notice
-
-1. This notice is required to be provided under our contract with the U.S.
-   Department of Energy (DOE). This work was produced at Lawrence Livermore
-   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
-
-2. Neither the United States Government nor Lawrence Livermore National
-   Security, LLC nor any of their employees, makes any warranty, express
-   or implied, or assumes any liability or responsibility for the accuracy,
-   completeness, or usefulness of any information, apparatus, product, or
-   process disclosed, or represents that its use would not infringe
-   privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, or
-   services by trade name, trademark, manufacturer or otherwise does not
-   necessarily constitute or imply its endorsement, recommendation, or
-   favoring by the United States Government or Lawrence Livermore National
-   Security, LLC. The views and opinions of authors expressed herein do not
-   necessarily state or reflect those of the United States Government or
-   Lawrence Livermore National Security, LLC, and shall not be used for
-   advertising or product endorsement purposes.
-
-*/
-
-#include <climits>
-#include <vector>
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <time.h>
-#include <sys/time.h>
-#include <iostream>
-#include <unistd.h>
-
-#if USE_OMP
-# include <omp.h>
-#endif
-
-#include "lulesh.h"
-
-#include "Timer.hxx"
-
-/******************************************/
-
-/* Work Routines */
-
-static inline
-void TimeIncrement(Domain& domain)
-{
-   Real_t targetdt = domain.stoptime() - domain.time() ;
-
-   if ((domain.dtfixed() <= Real_t(0.0)) && (domain.cycle() != Int_t(0))) {
-      Real_t ratio ;
-      Real_t olddt = domain.deltatime() ;
-
-      /* This will require a reduction in parallel */
-      Real_t gnewdt = Real_t(1.0e+20) ;
-      Real_t newdt ;
-      if (domain.dtcourant() < gnewdt) {
-         gnewdt = domain.dtcourant() / Real_t(2.0) ;
-      }
-      if (domain.dthydro() < gnewdt) {
-         gnewdt = domain.dthydro() * Real_t(2.0) / Real_t(3.0) ;
-      }
-
-#if USE_MPI      
-      MPI_Allreduce(&gnewdt, &newdt, 1,
-                    ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE),
-                    MPI_MIN, MPI_COMM_WORLD) ;
-#else
-      newdt = gnewdt;
-#endif
-      
-      ratio = newdt / olddt ;
-      if (ratio >= Real_t(1.0)) {
-         if (ratio < domain.deltatimemultlb()) {
-            newdt = olddt ;
-         }
-         else if (ratio > domain.deltatimemultub()) {
-            newdt = olddt*domain.deltatimemultub() ;
-         }
-      }
-
-      if (newdt > domain.dtmax()) {
-         newdt = domain.dtmax() ;
-      }
-      domain.deltatime() = newdt ;
-   }
-
-   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */
-   if ((targetdt > domain.deltatime()) &&
-       (targetdt < (Real_t(4.0) * domain.deltatime() / Real_t(3.0))) ) {
-      targetdt = Real_t(2.0) * domain.deltatime() / Real_t(3.0) ;
-   }
-
-   if (targetdt < domain.deltatime()) {
-      domain.deltatime() = targetdt ;
-   }
-
-   domain.time() += domain.deltatime() ;
-
-   ++domain.cycle() ;
-}
-
-/******************************************/
-
-static inline
-void CollectDomainNodesToElemNodes(Domain &domain,
-                                   const Index_t* elemToNode,
-                                   Real_t elemX[8],
-                                   Real_t elemY[8],
-                                   Real_t elemZ[8])
-{
-   Index_t nd0i = elemToNode[0] ;
-   Index_t nd1i = elemToNode[1] ;
-   Index_t nd2i = elemToNode[2] ;
-   Index_t nd3i = elemToNode[3] ;
-   Index_t nd4i = elemToNode[4] ;
-   Index_t nd5i = elemToNode[5] ;
-   Index_t nd6i = elemToNode[6] ;
-   Index_t nd7i = elemToNode[7] ;
-
-   elemX[0] = domain.x(nd0i);
-   elemX[1] = domain.x(nd1i);
-   elemX[2] = domain.x(nd2i);
-   elemX[3] = domain.x(nd3i);
-   elemX[4] = domain.x(nd4i);
-   elemX[5] = domain.x(nd5i);
-   elemX[6] = domain.x(nd6i);
-   elemX[7] = domain.x(nd7i);
-
-   elemY[0] = domain.y(nd0i);
-   elemY[1] = domain.y(nd1i);
-   elemY[2] = domain.y(nd2i);
-   elemY[3] = domain.y(nd3i);
-   elemY[4] = domain.y(nd4i);
-   elemY[5] = domain.y(nd5i);
-   elemY[6] = domain.y(nd6i);
-   elemY[7] = domain.y(nd7i);
-
-   elemZ[0] = domain.z(nd0i);
-   elemZ[1] = domain.z(nd1i);
-   elemZ[2] = domain.z(nd2i);
-   elemZ[3] = domain.z(nd3i);
-   elemZ[4] = domain.z(nd4i);
-   elemZ[5] = domain.z(nd5i);
-   elemZ[6] = domain.z(nd6i);
-   elemZ[7] = domain.z(nd7i);
-
-}
-
-/******************************************/
-
-static inline
-void InitStressTermsForElems(Domain &domain,
-                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                             Index_t numElem)
-{
-   //
-   // pull in the stresses appropriate to the hydro integration
-   //
-
-#pragma omp parallel for firstprivate(numElem)
-   for (Index_t i = 0 ; i < numElem ; ++i){
-      sigxx[i] = sigyy[i] = sigzz[i] =  - domain.p(i) - domain.q(i) ;
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcElemShapeFunctionDerivatives( Real_t const x[],
-                                       Real_t const y[],
-                                       Real_t const z[],
-                                       Real_t b[][8],
-                                       Real_t* const volume )
-{
-  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;
-  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;
-  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;
-  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;
-
-  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;
-  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;
-  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;
-  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;
-
-  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;
-  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;
-  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;
-  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;
-
-  Real_t fjxxi, fjxet, fjxze;
-  Real_t fjyxi, fjyet, fjyze;
-  Real_t fjzxi, fjzet, fjzze;
-  Real_t cjxxi, cjxet, cjxze;
-  Real_t cjyxi, cjyet, cjyze;
-  Real_t cjzxi, cjzet, cjzze;
-
-  fjxxi = Real_t(.125) * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );
-  fjxet = Real_t(.125) * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );
-  fjxze = Real_t(.125) * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );
-
-  fjyxi = Real_t(.125) * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );
-  fjyet = Real_t(.125) * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );
-  fjyze = Real_t(.125) * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );
-
-  fjzxi = Real_t(.125) * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );
-  fjzet = Real_t(.125) * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );
-  fjzze = Real_t(.125) * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );
-
-  /* compute cofactors */
-  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);
-  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);
-  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);
-
-  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);
-  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);
-  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);
-
-  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);
-  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);
-  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);
-
-  /* calculate partials :
-     this need only be done for l = 0,1,2,3   since , by symmetry ,
-     (6,7,4,5) = - (0,1,2,3) .
-  */
-  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;
-  b[0][1] =      cjxxi  -  cjxet  -  cjxze;
-  b[0][2] =      cjxxi  +  cjxet  -  cjxze;
-  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;
-  b[0][4] = -b[0][2];
-  b[0][5] = -b[0][3];
-  b[0][6] = -b[0][0];
-  b[0][7] = -b[0][1];
-
-  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;
-  b[1][1] =      cjyxi  -  cjyet  -  cjyze;
-  b[1][2] =      cjyxi  +  cjyet  -  cjyze;
-  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;
-  b[1][4] = -b[1][2];
-  b[1][5] = -b[1][3];
-  b[1][6] = -b[1][0];
-  b[1][7] = -b[1][1];
-
-  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;
-  b[2][1] =      cjzxi  -  cjzet  -  cjzze;
-  b[2][2] =      cjzxi  +  cjzet  -  cjzze;
-  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;
-  b[2][4] = -b[2][2];
-  b[2][5] = -b[2][3];
-  b[2][6] = -b[2][0];
-  b[2][7] = -b[2][1];
-
-  /* calculate jacobian determinant (volume) */
-  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);
-}
-
-/******************************************/
-
-static inline
-void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,
-                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,
-                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,
-                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,
-                       const Real_t x0, const Real_t y0, const Real_t z0,
-                       const Real_t x1, const Real_t y1, const Real_t z1,
-                       const Real_t x2, const Real_t y2, const Real_t z2,
-                       const Real_t x3, const Real_t y3, const Real_t z3)
-{
-   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);
-   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);
-   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);
-   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);
-   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);
-   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);
-   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);
-   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);
-   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);
-
-   *normalX0 += areaX;
-   *normalX1 += areaX;
-   *normalX2 += areaX;
-   *normalX3 += areaX;
-
-   *normalY0 += areaY;
-   *normalY1 += areaY;
-   *normalY2 += areaY;
-   *normalY3 += areaY;
-
-   *normalZ0 += areaZ;
-   *normalZ1 += areaZ;
-   *normalZ2 += areaZ;
-   *normalZ3 += areaZ;
-}
-
-/******************************************/
-
-static inline
-void CalcElemNodeNormals(Real_t pfx[8],
-                         Real_t pfy[8],
-                         Real_t pfz[8],
-                         const Real_t x[8],
-                         const Real_t y[8],
-                         const Real_t z[8])
-{
-   for (Index_t i = 0 ; i < 8 ; ++i) {
-      pfx[i] = Real_t(0.0);
-      pfy[i] = Real_t(0.0);
-      pfz[i] = Real_t(0.0);
-   }
-   /* evaluate face one: nodes 0, 1, 2, 3 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[0], y[0], z[0], x[1], y[1], z[1],
-                  x[2], y[2], z[2], x[3], y[3], z[3]);
-   /* evaluate face two: nodes 0, 4, 5, 1 */
-   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[1], &pfy[1], &pfz[1],
-                  x[0], y[0], z[0], x[4], y[4], z[4],
-                  x[5], y[5], z[5], x[1], y[1], z[1]);
-   /* evaluate face three: nodes 1, 5, 6, 2 */
-   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[2], &pfy[2], &pfz[2],
-                  x[1], y[1], z[1], x[5], y[5], z[5],
-                  x[6], y[6], z[6], x[2], y[2], z[2]);
-   /* evaluate face four: nodes 2, 6, 7, 3 */
-   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[3], &pfy[3], &pfz[3],
-                  x[2], y[2], z[2], x[6], y[6], z[6],
-                  x[7], y[7], z[7], x[3], y[3], z[3]);
-   /* evaluate face five: nodes 3, 7, 4, 0 */
-   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[4], &pfy[4], &pfz[4],
-                  &pfx[0], &pfy[0], &pfz[0],
-                  x[3], y[3], z[3], x[7], y[7], z[7],
-                  x[4], y[4], z[4], x[0], y[0], z[0]);
-   /* evaluate face six: nodes 4, 7, 6, 5 */
-   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],
-                  &pfx[7], &pfy[7], &pfz[7],
-                  &pfx[6], &pfy[6], &pfz[6],
-                  &pfx[5], &pfy[5], &pfz[5],
-                  x[4], y[4], z[4], x[7], y[7], z[7],
-                  x[6], y[6], z[6], x[5], y[5], z[5]);
-}
-
-/******************************************/
-
-static inline
-void SumElemStressesToNodeForces( const Real_t B[][8],
-                                  const Real_t stress_xx,
-                                  const Real_t stress_yy,
-                                  const Real_t stress_zz,
-                                  Real_t fx[], Real_t fy[], Real_t fz[] )
-{
-   for(Index_t i = 0; i < 8; i++) {
-      fx[i] = -( stress_xx * B[0][i] );
-      fy[i] = -( stress_yy * B[1][i]  );
-      fz[i] = -( stress_zz * B[2][i] );
-   }
-}
-
-/******************************************/
-
-static inline
-void IntegrateStressForElems( Domain &domain,
-                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,
-                              Real_t *determ, Index_t numElem, Index_t numNode)
-{
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *fx_elem;
-   Real_t *fy_elem;
-   Real_t *fz_elem;
-   Real_t fx_local[8] ;
-   Real_t fy_local[8] ;
-   Real_t fz_local[8] ;
-
-
-  if (numthreads > 1) {
-     fx_elem = Allocate<Real_t>(numElem8) ;
-     fy_elem = Allocate<Real_t>(numElem8) ;
-     fz_elem = Allocate<Real_t>(numElem8) ;
-  }
-  // loop over all elements
-
-#pragma omp parallel for firstprivate(numElem)
-  for( Index_t k=0 ; k<numElem ; ++k )
-  {
-    const Index_t* const elemToNode = domain.nodelist(k);
-    Real_t B[3][8] ;// shape function derivatives
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
-
-    // Volume calculation involves extra work for numerical consistency
-    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,
-                                         B, &determ[k]);
-
-    CalcElemNodeNormals( B[0] , B[1], B[2],
-                          x_local, y_local, z_local );
-
-    if (numthreads > 1) {
-       // Eliminate thread writing conflicts at the nodes by giving
-       // each element its own copy to write to
-       SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                    &fx_elem[k*8],
-                                    &fy_elem[k*8],
-                                    &fz_elem[k*8] ) ;
-    }
-    else {
-       SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],
-                                    fx_local, fy_local, fz_local ) ;
-
-       // copy nodal force contributions to global force arrray.
-       for( Index_t lnode=0 ; lnode<8 ; ++lnode ) {
-          Index_t gnode = elemToNode[lnode];
-          domain.fx(gnode) += fx_local[lnode];
-          domain.fy(gnode) += fy_local[lnode];
-          domain.fz(gnode) += fz_local[lnode];
-       }
-    }
-  }
-
-  if (numthreads > 1) {
-     // If threaded, then we need to copy the data out of the temporary
-     // arrays used above into the final forces field
-#pragma omp parallel for firstprivate(numNode)
-     for( Index_t gnode=0 ; gnode<numNode ; ++gnode )
-     {
-        Index_t count = domain.nodeElemCount(gnode) ;
-        Index_t *cornerList = domain.nodeElemCornerList(gnode) ;
-        Real_t fx_tmp = Real_t(0.0) ;
-        Real_t fy_tmp = Real_t(0.0) ;
-        Real_t fz_tmp = Real_t(0.0) ;
-        for (Index_t i=0 ; i < count ; ++i) {
-           Index_t ielem = cornerList[i] ;
-           fx_tmp += fx_elem[ielem] ;
-           fy_tmp += fy_elem[ielem] ;
-           fz_tmp += fz_elem[ielem] ;
-        }
-        domain.fx(gnode) = fx_tmp ;
-        domain.fy(gnode) = fy_tmp ;
-        domain.fz(gnode) = fz_tmp ;
-     }
-     Release(&fz_elem) ;
-     Release(&fy_elem) ;
-     Release(&fx_elem) ;
-  }
-}
-
-/******************************************/
-
-static inline
-void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,
-             const Real_t x3, const Real_t x4, const Real_t x5,
-             const Real_t y0, const Real_t y1, const Real_t y2,
-             const Real_t y3, const Real_t y4, const Real_t y5,
-             const Real_t z0, const Real_t z1, const Real_t z2,
-             const Real_t z3, const Real_t z4, const Real_t z5,
-             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)
-{
-   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;
-
-   *dvdx =
-      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +
-      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -
-      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);
-   *dvdy =
-      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -
-      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +
-      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);
-
-   *dvdz =
-      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -
-      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +
-      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);
-
-   *dvdx *= twelfth;
-   *dvdy *= twelfth;
-   *dvdz *= twelfth;
-}
-
-/******************************************/
-
-static inline
-void CalcElemVolumeDerivative(Real_t dvdx[8],
-                              Real_t dvdy[8],
-                              Real_t dvdz[8],
-                              const Real_t x[8],
-                              const Real_t y[8],
-                              const Real_t z[8])
-{
-   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],
-           y[1], y[2], y[3], y[4], y[5], y[7],
-           z[1], z[2], z[3], z[4], z[5], z[7],
-           &dvdx[0], &dvdy[0], &dvdz[0]);
-   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],
-           y[0], y[1], y[2], y[7], y[4], y[6],
-           z[0], z[1], z[2], z[7], z[4], z[6],
-           &dvdx[3], &dvdy[3], &dvdz[3]);
-   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],
-           y[3], y[0], y[1], y[6], y[7], y[5],
-           z[3], z[0], z[1], z[6], z[7], z[5],
-           &dvdx[2], &dvdy[2], &dvdz[2]);
-   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],
-           y[2], y[3], y[0], y[5], y[6], y[4],
-           z[2], z[3], z[0], z[5], z[6], z[4],
-           &dvdx[1], &dvdy[1], &dvdz[1]);
-   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],
-           y[7], y[6], y[5], y[0], y[3], y[1],
-           z[7], z[6], z[5], z[0], z[3], z[1],
-           &dvdx[4], &dvdy[4], &dvdz[4]);
-   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],
-           y[4], y[7], y[6], y[1], y[0], y[2],
-           z[4], z[7], z[6], z[1], z[0], z[2],
-           &dvdx[5], &dvdy[5], &dvdz[5]);
-   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],
-           y[5], y[4], y[7], y[2], y[1], y[3],
-           z[5], z[4], z[7], z[2], z[1], z[3],
-           &dvdx[6], &dvdy[6], &dvdz[6]);
-   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],
-           y[6], y[5], y[4], y[3], y[2], y[0],
-           z[6], z[5], z[4], z[3], z[2], z[0],
-           &dvdx[7], &dvdy[7], &dvdz[7]);
-}
-
-/******************************************/
-
-static inline
-void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t hourgam[][4],
-                              Real_t coefficient,
-                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )
-{
-   Real_t hxx[4];
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * xd[0] + hourgam[1][i] * xd[1] +
-               hourgam[2][i] * xd[2] + hourgam[3][i] * xd[3] +
-               hourgam[4][i] * xd[4] + hourgam[5][i] * xd[5] +
-               hourgam[6][i] * xd[6] + hourgam[7][i] * xd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfx[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * yd[0] + hourgam[1][i] * yd[1] +
-               hourgam[2][i] * yd[2] + hourgam[3][i] * yd[3] +
-               hourgam[4][i] * yd[4] + hourgam[5][i] * yd[5] +
-               hourgam[6][i] * yd[6] + hourgam[7][i] * yd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfy[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-   for(Index_t i = 0; i < 4; i++) {
-      hxx[i] = hourgam[0][i] * zd[0] + hourgam[1][i] * zd[1] +
-               hourgam[2][i] * zd[2] + hourgam[3][i] * zd[3] +
-               hourgam[4][i] * zd[4] + hourgam[5][i] * zd[5] +
-               hourgam[6][i] * zd[6] + hourgam[7][i] * zd[7];
-   }
-   for(Index_t i = 0; i < 8; i++) {
-      hgfz[i] = coefficient *
-                (hourgam[i][0] * hxx[0] + hourgam[i][1] * hxx[1] +
-                 hourgam[i][2] * hxx[2] + hourgam[i][3] * hxx[3]);
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcFBHourglassForceForElems( Domain &domain,
-                                   Real_t *determ,
-                                   Real_t *x8n, Real_t *y8n, Real_t *z8n,
-                                   Real_t *dvdx, Real_t *dvdy, Real_t *dvdz,
-                                   Real_t hourg, Index_t numElem,
-                                   Index_t numNode)
-{
-
-#if USE_OMP
-   Index_t numthreads = omp_get_max_threads();
-#else
-   Index_t numthreads = 1;
-#endif
-   /*************************************************
-    *
-    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass
-    *               force.
-    *
-    *************************************************/
-  
-   Index_t numElem8 = numElem * 8 ;
-
-   Real_t *fx_elem; 
-   Real_t *fy_elem; 
-   Real_t *fz_elem; 
-
-   if(numthreads > 1) {
-      fx_elem = Allocate<Real_t>(numElem8) ;
-      fy_elem = Allocate<Real_t>(numElem8) ;
-      fz_elem = Allocate<Real_t>(numElem8) ;
-   }
-
-   Real_t  gamma[4][8];
-
-   gamma[0][0] = Real_t( 1.);
-   gamma[0][1] = Real_t( 1.);
-   gamma[0][2] = Real_t(-1.);
-   gamma[0][3] = Real_t(-1.);
-   gamma[0][4] = Real_t(-1.);
-   gamma[0][5] = Real_t(-1.);
-   gamma[0][6] = Real_t( 1.);
-   gamma[0][7] = Real_t( 1.);
-   gamma[1][0] = Real_t( 1.);
-   gamma[1][1] = Real_t(-1.);
-   gamma[1][2] = Real_t(-1.);
-   gamma[1][3] = Real_t( 1.);
-   gamma[1][4] = Real_t(-1.);
-   gamma[1][5] = Real_t( 1.);
-   gamma[1][6] = Real_t( 1.);
-   gamma[1][7] = Real_t(-1.);
-   gamma[2][0] = Real_t( 1.);
-   gamma[2][1] = Real_t(-1.);
-   gamma[2][2] = Real_t( 1.);
-   gamma[2][3] = Real_t(-1.);
-   gamma[2][4] = Real_t( 1.);
-   gamma[2][5] = Real_t(-1.);
-   gamma[2][6] = Real_t( 1.);
-   gamma[2][7] = Real_t(-1.);
-   gamma[3][0] = Real_t(-1.);
-   gamma[3][1] = Real_t( 1.);
-   gamma[3][2] = Real_t(-1.);
-   gamma[3][3] = Real_t( 1.);
-   gamma[3][4] = Real_t( 1.);
-   gamma[3][5] = Real_t(-1.);
-   gamma[3][6] = Real_t( 1.);
-   gamma[3][7] = Real_t(-1.);
-
-/*************************************************/
-/*    compute the hourglass modes */
-
-
-#pragma omp parallel for firstprivate(numElem, hourg)
-   for(Index_t i2=0;i2<numElem;++i2){
-      Real_t *fx_local, *fy_local, *fz_local ;
-      Real_t hgfx[8], hgfy[8], hgfz[8] ;
-
-      Real_t coefficient;
-
-      Real_t hourgam[8][4];
-      Real_t xd1[8], yd1[8], zd1[8] ;
-
-      const Index_t *elemToNode = domain.nodelist(i2);
-      Index_t i3=8*i2;
-      Real_t volinv=Real_t(1.0)/determ[i2];
-      Real_t ss1, mass1, volume13 ;
-      for(Index_t i1=0;i1<4;++i1){
-
-         Real_t hourmodx =
-            x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +
-            x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +
-            x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +
-            x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmody =
-            y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +
-            y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +
-            y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +
-            y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];
-
-         Real_t hourmodz =
-            z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +
-            z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +
-            z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +
-            z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];
-
-         hourgam[0][i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +
-                                                  dvdy[i3  ] * hourmody +
-                                                  dvdz[i3  ] * hourmodz );
-
-         hourgam[1][i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +
-                                                  dvdy[i3+1] * hourmody +
-                                                  dvdz[i3+1] * hourmodz );
-
-         hourgam[2][i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +
-                                                  dvdy[i3+2] * hourmody +
-                                                  dvdz[i3+2] * hourmodz );
-
-         hourgam[3][i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +
-                                                  dvdy[i3+3] * hourmody +
-                                                  dvdz[i3+3] * hourmodz );
-
-         hourgam[4][i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +
-                                                  dvdy[i3+4] * hourmody +
-                                                  dvdz[i3+4] * hourmodz );
-
-         hourgam[5][i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +
-                                                  dvdy[i3+5] * hourmody +
-                                                  dvdz[i3+5] * hourmodz );
-
-         hourgam[6][i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +
-                                                  dvdy[i3+6] * hourmody +
-                                                  dvdz[i3+6] * hourmodz );
-
-         hourgam[7][i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +
-                                                  dvdy[i3+7] * hourmody +
-                                                  dvdz[i3+7] * hourmodz );
-
-      }
-
-      /* compute forces */
-      /* store forces into h arrays (force arrays) */
-
-      ss1=domain.ss(i2);
-      mass1=domain.elemMass(i2);
-      volume13=CBRT(determ[i2]);
-
-      Index_t n0si2 = elemToNode[0];
-      Index_t n1si2 = elemToNode[1];
-      Index_t n2si2 = elemToNode[2];
-      Index_t n3si2 = elemToNode[3];
-      Index_t n4si2 = elemToNode[4];
-      Index_t n5si2 = elemToNode[5];
-      Index_t n6si2 = elemToNode[6];
-      Index_t n7si2 = elemToNode[7];
-
-      xd1[0] = domain.xd(n0si2);
-      xd1[1] = domain.xd(n1si2);
-      xd1[2] = domain.xd(n2si2);
-      xd1[3] = domain.xd(n3si2);
-      xd1[4] = domain.xd(n4si2);
-      xd1[5] = domain.xd(n5si2);
-      xd1[6] = domain.xd(n6si2);
-      xd1[7] = domain.xd(n7si2);
-
-      yd1[0] = domain.yd(n0si2);
-      yd1[1] = domain.yd(n1si2);
-      yd1[2] = domain.yd(n2si2);
-      yd1[3] = domain.yd(n3si2);
-      yd1[4] = domain.yd(n4si2);
-      yd1[5] = domain.yd(n5si2);
-      yd1[6] = domain.yd(n6si2);
-      yd1[7] = domain.yd(n7si2);
-
-      zd1[0] = domain.zd(n0si2);
-      zd1[1] = domain.zd(n1si2);
-      zd1[2] = domain.zd(n2si2);
-      zd1[3] = domain.zd(n3si2);
-      zd1[4] = domain.zd(n4si2);
-      zd1[5] = domain.zd(n5si2);
-      zd1[6] = domain.zd(n6si2);
-      zd1[7] = domain.zd(n7si2);
-
-      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;
-
-      CalcElemFBHourglassForce(xd1,yd1,zd1,
-                      hourgam,
-                      coefficient, hgfx, hgfy, hgfz);
-
-      // With the threaded version, we write into local arrays per elem
-      // so we don't have to worry about race conditions
-      if (numthreads > 1) {
-         fx_local = &fx_elem[i3] ;
-         fx_local[0] = hgfx[0];
-         fx_local[1] = hgfx[1];
-         fx_local[2] = hgfx[2];
-         fx_local[3] = hgfx[3];
-         fx_local[4] = hgfx[4];
-         fx_local[5] = hgfx[5];
-         fx_local[6] = hgfx[6];
-         fx_local[7] = hgfx[7];
-
-         fy_local = &fy_elem[i3] ;
-         fy_local[0] = hgfy[0];
-         fy_local[1] = hgfy[1];
-         fy_local[2] = hgfy[2];
-         fy_local[3] = hgfy[3];
-         fy_local[4] = hgfy[4];
-         fy_local[5] = hgfy[5];
-         fy_local[6] = hgfy[6];
-         fy_local[7] = hgfy[7];
-
-         fz_local = &fz_elem[i3] ;
-         fz_local[0] = hgfz[0];
-         fz_local[1] = hgfz[1];
-         fz_local[2] = hgfz[2];
-         fz_local[3] = hgfz[3];
-         fz_local[4] = hgfz[4];
-         fz_local[5] = hgfz[5];
-         fz_local[6] = hgfz[6];
-         fz_local[7] = hgfz[7];
-      }
-      else {
-         domain.fx(n0si2) += hgfx[0];
-         domain.fy(n0si2) += hgfy[0];
-         domain.fz(n0si2) += hgfz[0];
-
-         domain.fx(n1si2) += hgfx[1];
-         domain.fy(n1si2) += hgfy[1];
-         domain.fz(n1si2) += hgfz[1];
-
-         domain.fx(n2si2) += hgfx[2];
-         domain.fy(n2si2) += hgfy[2];
-         domain.fz(n2si2) += hgfz[2];
-
-         domain.fx(n3si2) += hgfx[3];
-         domain.fy(n3si2) += hgfy[3];
-         domain.fz(n3si2) += hgfz[3];
-
-         domain.fx(n4si2) += hgfx[4];
-         domain.fy(n4si2) += hgfy[4];
-         domain.fz(n4si2) += hgfz[4];
-
-         domain.fx(n5si2) += hgfx[5];
-         domain.fy(n5si2) += hgfy[5];
-         domain.fz(n5si2) += hgfz[5];
-
-         domain.fx(n6si2) += hgfx[6];
-         domain.fy(n6si2) += hgfy[6];
-         domain.fz(n6si2) += hgfz[6];
-
-         domain.fx(n7si2) += hgfx[7];
-         domain.fy(n7si2) += hgfy[7];
-         domain.fz(n7si2) += hgfz[7];
-      }
-   }
-
-   if (numthreads > 1) {
-     // Collect the data from the local arrays into the final force arrays
-#pragma omp parallel for firstprivate(numNode)
-      for( Index_t gnode=0 ; gnode<numNode ; ++gnode )
-      {
-         Index_t count = domain.nodeElemCount(gnode) ;
-         Index_t *cornerList = domain.nodeElemCornerList(gnode) ;
-         Real_t fx_tmp = Real_t(0.0) ;
-         Real_t fy_tmp = Real_t(0.0) ;
-         Real_t fz_tmp = Real_t(0.0) ;
-         for (Index_t i=0 ; i < count ; ++i) {
-            Index_t ielem = cornerList[i] ;
-            fx_tmp += fx_elem[ielem] ;
-            fy_tmp += fy_elem[ielem] ;
-            fz_tmp += fz_elem[ielem] ;
-         }
-         domain.fx(gnode) += fx_tmp ;
-         domain.fy(gnode) += fy_tmp ;
-         domain.fz(gnode) += fz_tmp ;
-      }
-      Release(&fz_elem) ;
-      Release(&fy_elem) ;
-      Release(&fx_elem) ;
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcHourglassControlForElems(Domain& domain,
-                                  Real_t determ[], Real_t hgcoef)
-{
-   Index_t numElem = domain.numElem() ;
-   Index_t numElem8 = numElem * 8 ;
-   Real_t *dvdx = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdy = Allocate<Real_t>(numElem8) ;
-   Real_t *dvdz = Allocate<Real_t>(numElem8) ;
-   Real_t *x8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *y8n  = Allocate<Real_t>(numElem8) ;
-   Real_t *z8n  = Allocate<Real_t>(numElem8) ;
-
-   /* start loop over elements */
-#pragma omp parallel for firstprivate(numElem)
-   for (Index_t i=0 ; i<numElem ; ++i){
-      Real_t  x1[8],  y1[8],  z1[8] ;
-      Real_t pfx[8], pfy[8], pfz[8] ;
-
-      Index_t* elemToNode = domain.nodelist(i);
-      CollectDomainNodesToElemNodes(domain, elemToNode, x1, y1, z1);
-
-      CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);
-
-      /* load into temporary storage for FB Hour Glass control */
-      for(Index_t ii=0;ii<8;++ii){
-         Index_t jj=8*i+ii;
-
-         dvdx[jj] = pfx[ii];
-         dvdy[jj] = pfy[ii];
-         dvdz[jj] = pfz[ii];
-         x8n[jj]  = x1[ii];
-         y8n[jj]  = y1[ii];
-         z8n[jj]  = z1[ii];
-      }
-
-      determ[i] = domain.volo(i) * domain.v(i);
-
-      /* Do a check for negative volumes */
-      if ( domain.v(i) <= Real_t(0.0) ) {
-#if USE_MPI         
-         MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-         exit(VolumeError);
-#endif
-      }
-   }
-
-   if ( hgcoef > Real_t(0.) ) {
-      CalcFBHourglassForceForElems( domain,
-                                    determ, x8n, y8n, z8n, dvdx, dvdy, dvdz,
-                                    hgcoef, numElem, domain.numNode()) ;
-   }
-
-   Release(&z8n) ;
-   Release(&y8n) ;
-   Release(&x8n) ;
-   Release(&dvdz) ;
-   Release(&dvdy) ;
-   Release(&dvdx) ;
-
-   return ;
-}
-
-/******************************************/
-
-static inline
-void CalcVolumeForceForElems(Domain& domain)
-{
-   Index_t numElem = domain.numElem() ;
-   if (numElem != 0) {
-      Real_t  hgcoef = domain.hgcoef() ;
-      Real_t *sigxx  = Allocate<Real_t>(numElem) ;
-      Real_t *sigyy  = Allocate<Real_t>(numElem) ;
-      Real_t *sigzz  = Allocate<Real_t>(numElem) ;
-      Real_t *determ = Allocate<Real_t>(numElem) ;
-
-      /* Sum contributions to total stress tensor */
-      InitStressTermsForElems(domain, sigxx, sigyy, sigzz, numElem);
-
-      // call elemlib stress integration loop to produce nodal forces from
-      // material stresses.
-      IntegrateStressForElems( domain,
-                               sigxx, sigyy, sigzz, determ, numElem,
-                               domain.numNode()) ;
-
-      // check for negative element volume
-#pragma omp parallel for firstprivate(numElem)
-      for ( Index_t k=0 ; k<numElem ; ++k ) {
-         if (determ[k] <= Real_t(0.0)) {
-#if USE_MPI            
-            MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-            exit(VolumeError);
-#endif
-         }
-      }
-
-      CalcHourglassControlForElems(domain, determ, hgcoef) ;
-
-      Release(&determ) ;
-      Release(&sigzz) ;
-      Release(&sigyy) ;
-      Release(&sigxx) ;
-   }
-}
-
-/******************************************/
-
-static inline void CalcForceForNodes(Domain& domain)
-{
-  Index_t numNode = domain.numNode() ;
-
-#if USE_MPI  
-  CommRecv(domain, MSG_COMM_SBN, 3,
-           domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1,
-           true, false) ;
-#endif  
-
-#pragma omp parallel for firstprivate(numNode)
-  for (Index_t i=0; i<numNode; ++i) {
-     domain.fx(i) = Real_t(0.0) ;
-     domain.fy(i) = Real_t(0.0) ;
-     domain.fz(i) = Real_t(0.0) ;
-  }
-
-  /* Calcforce calls partial, force, hourq */
-  CalcVolumeForceForElems(domain) ;
-
-#if USE_MPI  
-  Domain_member fieldData[3] ;
-  fieldData[0] = &Domain::fx ;
-  fieldData[1] = &Domain::fy ;
-  fieldData[2] = &Domain::fz ;
-  
-  CommSend(domain, MSG_COMM_SBN, 3, fieldData,
-           domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() +  1,
-           true, false) ;
-  CommSBN(domain, 3, fieldData) ;
-#endif  
-}
-
-/******************************************/
-
-static inline
-void CalcAccelerationForNodes(Domain &domain, Index_t numNode)
-{
-   
-#pragma omp parallel for firstprivate(numNode)
-   for (Index_t i = 0; i < numNode; ++i) {
-      domain.xdd(i) = domain.fx(i) / domain.nodalMass(i);
-      domain.ydd(i) = domain.fy(i) / domain.nodalMass(i);
-      domain.zdd(i) = domain.fz(i) / domain.nodalMass(i);
-   }
-}
-
-/******************************************/
-
-static inline
-void ApplyAccelerationBoundaryConditionsForNodes(Domain& domain)
-{
-   Index_t size = domain.sizeX();
-   Index_t numNodeBC = (size+1)*(size+1) ;
-
-#pragma omp parallel
-   {
-      if (!domain.symmXempty() != 0) {
-#pragma omp for nowait firstprivate(numNodeBC)
-         for(Index_t i=0 ; i<numNodeBC ; ++i)
-            domain.xdd(domain.symmX(i)) = Real_t(0.0) ;
-      }
-
-      if (!domain.symmYempty() != 0) {
-#pragma omp for nowait firstprivate(numNodeBC)
-         for(Index_t i=0 ; i<numNodeBC ; ++i)
-            domain.ydd(domain.symmY(i)) = Real_t(0.0) ;
-      }
-
-      if (!domain.symmZempty() != 0) {
-#pragma omp for nowait firstprivate(numNodeBC)
-         for(Index_t i=0 ; i<numNodeBC ; ++i)
-            domain.zdd(domain.symmZ(i)) = Real_t(0.0) ;
-      }
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcVelocityForNodes(Domain &domain, const Real_t dt, const Real_t u_cut,
-                          Index_t numNode)
-{
-
-#pragma omp parallel for firstprivate(numNode)
-   for ( Index_t i = 0 ; i < numNode ; ++i )
-   {
-     Real_t xdtmp, ydtmp, zdtmp ;
-
-     xdtmp = domain.xd(i) + domain.xdd(i) * dt ;
-     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);
-     domain.xd(i) = xdtmp ;
-
-     ydtmp = domain.yd(i) + domain.ydd(i) * dt ;
-     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);
-     domain.yd(i) = ydtmp ;
-
-     zdtmp = domain.zd(i) + domain.zdd(i) * dt ;
-     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);
-     domain.zd(i) = zdtmp ;
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcPositionForNodes(Domain &domain, const Real_t dt, Index_t numNode)
-{
-#pragma omp parallel for firstprivate(numNode)
-   for ( Index_t i = 0 ; i < numNode ; ++i )
-   {
-     domain.x(i) += domain.xd(i) * dt ;
-     domain.y(i) += domain.yd(i) * dt ;
-     domain.z(i) += domain.zd(i) * dt ;
-   }
-}
-
-/******************************************/
-
-static inline
-void LagrangeNodal(Domain& domain)
-{
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-   Domain_member fieldData[6] ;
-#endif
-
-   const Real_t delt = domain.deltatime() ;
-   Real_t u_cut = domain.u_cut() ;
-
-  /* time of boundary condition evaluation is beginning of step for force and
-   * acceleration boundary conditions. */
-  CalcForceForNodes(domain);
-
-#if USE_MPI  
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-   CommRecv(domain, MSG_SYNC_POS_VEL, 6,
-            domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1,
-            false, false) ;
-#endif
-#endif
-   
-   CalcAccelerationForNodes(domain, domain.numNode());
-   
-   ApplyAccelerationBoundaryConditionsForNodes(domain);
-
-   CalcVelocityForNodes( domain, delt, u_cut, domain.numNode()) ;
-
-   CalcPositionForNodes( domain, delt, domain.numNode() );
-#if USE_MPI
-#if defined(SEDOV_SYNC_POS_VEL_EARLY)
-  fieldData[0] = &Domain::x ;
-  fieldData[1] = &Domain::y ;
-  fieldData[2] = &Domain::z ;
-  fieldData[3] = &Domain::xd ;
-  fieldData[4] = &Domain::yd ;
-  fieldData[5] = &Domain::zd ;
-
-   CommSend(domain, MSG_SYNC_POS_VEL, 6, fieldData,
-            domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1,
-            false, false) ;
-   CommSyncPosVel(domain) ;
-#endif
-#endif
-   
-  return;
-}
-
-/******************************************/
-
-static inline
-Real_t CalcElemVolume( const Real_t x0, const Real_t x1,
-               const Real_t x2, const Real_t x3,
-               const Real_t x4, const Real_t x5,
-               const Real_t x6, const Real_t x7,
-               const Real_t y0, const Real_t y1,
-               const Real_t y2, const Real_t y3,
-               const Real_t y4, const Real_t y5,
-               const Real_t y6, const Real_t y7,
-               const Real_t z0, const Real_t z1,
-               const Real_t z2, const Real_t z3,
-               const Real_t z4, const Real_t z5,
-               const Real_t z6, const Real_t z7 )
-{
-  Real_t twelveth = Real_t(1.0)/Real_t(12.0);
-
-  Real_t dx61 = x6 - x1;
-  Real_t dy61 = y6 - y1;
-  Real_t dz61 = z6 - z1;
-
-  Real_t dx70 = x7 - x0;
-  Real_t dy70 = y7 - y0;
-  Real_t dz70 = z7 - z0;
-
-  Real_t dx63 = x6 - x3;
-  Real_t dy63 = y6 - y3;
-  Real_t dz63 = z6 - z3;
-
-  Real_t dx20 = x2 - x0;
-  Real_t dy20 = y2 - y0;
-  Real_t dz20 = z2 - z0;
-
-  Real_t dx50 = x5 - x0;
-  Real_t dy50 = y5 - y0;
-  Real_t dz50 = z5 - z0;
-
-  Real_t dx64 = x6 - x4;
-  Real_t dy64 = y6 - y4;
-  Real_t dz64 = z6 - z4;
-
-  Real_t dx31 = x3 - x1;
-  Real_t dy31 = y3 - y1;
-  Real_t dz31 = z3 - z1;
-
-  Real_t dx72 = x7 - x2;
-  Real_t dy72 = y7 - y2;
-  Real_t dz72 = z7 - z2;
-
-  Real_t dx43 = x4 - x3;
-  Real_t dy43 = y4 - y3;
-  Real_t dz43 = z4 - z3;
-
-  Real_t dx57 = x5 - x7;
-  Real_t dy57 = y5 - y7;
-  Real_t dz57 = z5 - z7;
-
-  Real_t dx14 = x1 - x4;
-  Real_t dy14 = y1 - y4;
-  Real_t dz14 = z1 - z4;
-
-  Real_t dx25 = x2 - x5;
-  Real_t dy25 = y2 - y5;
-  Real_t dz25 = z2 - z5;
-
-#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \
-   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))
-
-  Real_t volume =
-    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,
-       dy31 + dy72, dy63, dy20,
-       dz31 + dz72, dz63, dz20) +
-    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,
-       dy43 + dy57, dy64, dy70,
-       dz43 + dz57, dz64, dz70) +
-    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,
-       dy14 + dy25, dy61, dy50,
-       dz14 + dz25, dz61, dz50);
-
-#undef TRIPLE_PRODUCT
-
-  volume *= twelveth;
-
-  return volume ;
-}
-
-/******************************************/
-
-//inline
-Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )
-{
-return CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
-                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],
-                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);
-}
-
-/******************************************/
-
-static inline
-Real_t AreaFace( const Real_t x0, const Real_t x1,
-                 const Real_t x2, const Real_t x3,
-                 const Real_t y0, const Real_t y1,
-                 const Real_t y2, const Real_t y3,
-                 const Real_t z0, const Real_t z1,
-                 const Real_t z2, const Real_t z3)
-{
-   Real_t fx = (x2 - x0) - (x3 - x1);
-   Real_t fy = (y2 - y0) - (y3 - y1);
-   Real_t fz = (z2 - z0) - (z3 - z1);
-   Real_t gx = (x2 - x0) + (x3 - x1);
-   Real_t gy = (y2 - y0) + (y3 - y1);
-   Real_t gz = (z2 - z0) + (z3 - z1);
-   Real_t area =
-      (fx * fx + fy * fy + fz * fz) *
-      (gx * gx + gy * gy + gz * gz) -
-      (fx * gx + fy * gy + fz * gz) *
-      (fx * gx + fy * gy + fz * gz);
-   return area ;
-}
-
-/******************************************/
-
-static inline
-Real_t CalcElemCharacteristicLength( const Real_t x[8],
-                                     const Real_t y[8],
-                                     const Real_t z[8],
-                                     const Real_t volume)
-{
-   Real_t a, charLength = Real_t(0.0);
-
-   a = AreaFace(x[0],x[1],x[2],x[3],
-                y[0],y[1],y[2],y[3],
-                z[0],z[1],z[2],z[3]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[4],x[5],x[6],x[7],
-                y[4],y[5],y[6],y[7],
-                z[4],z[5],z[6],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[0],x[1],x[5],x[4],
-                y[0],y[1],y[5],y[4],
-                z[0],z[1],z[5],z[4]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[1],x[2],x[6],x[5],
-                y[1],y[2],y[6],y[5],
-                z[1],z[2],z[6],z[5]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[2],x[3],x[7],x[6],
-                y[2],y[3],y[7],y[6],
-                z[2],z[3],z[7],z[6]) ;
-   charLength = std::max(a,charLength) ;
-
-   a = AreaFace(x[3],x[0],x[4],x[7],
-                y[3],y[0],y[4],y[7],
-                z[3],z[0],z[4],z[7]) ;
-   charLength = std::max(a,charLength) ;
-
-   charLength = Real_t(4.0) * volume / SQRT(charLength);
-
-   return charLength;
-}
-
-/******************************************/
-
-static inline
-void CalcElemVelocityGradient( const Real_t* const xvel,
-                                const Real_t* const yvel,
-                                const Real_t* const zvel,
-                                const Real_t b[][8],
-                                const Real_t detJ,
-                                Real_t* const d )
-{
-  const Real_t inv_detJ = Real_t(1.0) / detJ ;
-  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;
-  const Real_t* const pfx = b[0];
-  const Real_t* const pfy = b[1];
-  const Real_t* const pfz = b[2];
-
-  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])
-                     + pfx[1] * (xvel[1]-xvel[7])
-                     + pfx[2] * (xvel[2]-xvel[4])
-                     + pfx[3] * (xvel[3]-xvel[5]) );
-
-  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])
-                     + pfy[1] * (yvel[1]-yvel[7])
-                     + pfy[2] * (yvel[2]-yvel[4])
-                     + pfy[3] * (yvel[3]-yvel[5]) );
-
-  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])
-                     + pfz[1] * (zvel[1]-zvel[7])
-                     + pfz[2] * (zvel[2]-zvel[4])
-                     + pfz[3] * (zvel[3]-zvel[5]) );
-
-  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])
-                      + pfx[1] * (yvel[1]-yvel[7])
-                      + pfx[2] * (yvel[2]-yvel[4])
-                      + pfx[3] * (yvel[3]-yvel[5]) );
-
-  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])
-                      + pfy[1] * (xvel[1]-xvel[7])
-                      + pfy[2] * (xvel[2]-xvel[4])
-                      + pfy[3] * (xvel[3]-xvel[5]) );
-
-  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])
-                      + pfx[1] * (zvel[1]-zvel[7])
-                      + pfx[2] * (zvel[2]-zvel[4])
-                      + pfx[3] * (zvel[3]-zvel[5]) );
-
-  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])
-                      + pfz[1] * (xvel[1]-xvel[7])
-                      + pfz[2] * (xvel[2]-xvel[4])
-                      + pfz[3] * (xvel[3]-xvel[5]) );
-
-  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])
-                      + pfy[1] * (zvel[1]-zvel[7])
-                      + pfy[2] * (zvel[2]-zvel[4])
-                      + pfy[3] * (zvel[3]-zvel[5]) );
-
-  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])
-                      + pfz[1] * (yvel[1]-yvel[7])
-                      + pfz[2] * (yvel[2]-yvel[4])
-                      + pfz[3] * (yvel[3]-yvel[5]) );
-  d[5]  = Real_t( .5) * ( dxddy + dyddx );
-  d[4]  = Real_t( .5) * ( dxddz + dzddx );
-  d[3]  = Real_t( .5) * ( dzddy + dyddz );
-}
-
-/******************************************/
-
-//static inline
-void CalcKinematicsForElems( Domain &domain,
-                             Real_t deltaTime, Index_t numElem )
-{
-
-  // loop over all elements
-#pragma omp parallel for firstprivate(numElem, deltaTime)
-  for( Index_t k=0 ; k<numElem ; ++k )
-  {
-    Real_t B[3][8] ; /** shape function derivatives */
-    Real_t D[6] ;
-    Real_t x_local[8] ;
-    Real_t y_local[8] ;
-    Real_t z_local[8] ;
-    Real_t xd_local[8] ;
-    Real_t yd_local[8] ;
-    Real_t zd_local[8] ;
-    Real_t detJ = Real_t(0.0) ;
-
-    Real_t volume ;
-    Real_t relativeVolume ;
-    const Index_t* const elemToNode = domain.nodelist(k) ;
-
-    // get nodal coordinates from global arrays and copy into local arrays.
-    CollectDomainNodesToElemNodes(domain, elemToNode, x_local, y_local, z_local);
-
-    // volume calculations
-    volume = CalcElemVolume(x_local, y_local, z_local );
-    relativeVolume = volume / domain.volo(k) ;
-    domain.vnew(k) = relativeVolume ;
-    domain.delv(k) = relativeVolume - domain.v(k) ;
-
-    // set characteristic length
-    domain.arealg(k) = CalcElemCharacteristicLength(x_local, y_local, z_local,
-                                             volume);
-
-    // get nodal velocities from global array and copy into local arrays.
-    for( Index_t lnode=0 ; lnode<8 ; ++lnode )
-    {
-      Index_t gnode = elemToNode[lnode];
-      xd_local[lnode] = domain.xd(gnode);
-      yd_local[lnode] = domain.yd(gnode);
-      zd_local[lnode] = domain.zd(gnode);
-    }
-
-    Real_t dt2 = Real_t(0.5) * deltaTime;
-    for ( Index_t j=0 ; j<8 ; ++j )
-    {
-       x_local[j] -= dt2 * xd_local[j];
-       y_local[j] -= dt2 * yd_local[j];
-       z_local[j] -= dt2 * zd_local[j];
-    }
-
-    CalcElemShapeFunctionDerivatives( x_local, y_local, z_local,
-                                      B, &detJ );
-
-    CalcElemVelocityGradient( xd_local, yd_local, zd_local,
-                               B, detJ, D );
-
-    // put velocity gradient quantities into their global arrays.
-    domain.dxx(k) = D[0];
-    domain.dyy(k) = D[1];
-    domain.dzz(k) = D[2];
-  }
-}
-
-/******************************************/
-
-static inline
-void CalcLagrangeElements(Domain& domain)
-{
-   Index_t numElem = domain.numElem() ;
-   if (numElem > 0) {
-      const Real_t deltatime = domain.deltatime() ;
-
-      domain.AllocateStrains(numElem);
-
-      CalcKinematicsForElems(domain, deltatime, numElem) ;
-
-      // element loop to do some stuff not included in the elemlib function.
-#pragma omp parallel for firstprivate(numElem)
-      for ( Index_t k=0 ; k<numElem ; ++k )
-      {
-         // calc strain rate and apply as constraint (only done in FB element)
-         Real_t vdov = domain.dxx(k) + domain.dyy(k) + domain.dzz(k) ;
-         Real_t vdovthird = vdov/Real_t(3.0) ;
-
-         // make the rate of deformation tensor deviatoric
-         domain.vdov(k) = vdov ;
-         domain.dxx(k) -= vdovthird ;
-         domain.dyy(k) -= vdovthird ;
-         domain.dzz(k) -= vdovthird ;
-
-        // See if any volumes are negative, and take appropriate action.
-         if (domain.vnew(k) <= Real_t(0.0))
-        {
-#if USE_MPI           
-           MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-           exit(VolumeError);
-#endif
-        }
-      }
-      domain.DeallocateStrains();
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcMonotonicQGradientsForElems(Domain& domain)
-{
-   Index_t numElem = domain.numElem();
-
-#pragma omp parallel for firstprivate(numElem)
-   for (Index_t i = 0 ; i < numElem ; ++i ) {
-      const Real_t ptiny = Real_t(1.e-36) ;
-      Real_t ax,ay,az ;
-      Real_t dxv,dyv,dzv ;
-
-      const Index_t *elemToNode = domain.nodelist(i);
-      Index_t n0 = elemToNode[0] ;
-      Index_t n1 = elemToNode[1] ;
-      Index_t n2 = elemToNode[2] ;
-      Index_t n3 = elemToNode[3] ;
-      Index_t n4 = elemToNode[4] ;
-      Index_t n5 = elemToNode[5] ;
-      Index_t n6 = elemToNode[6] ;
-      Index_t n7 = elemToNode[7] ;
-
-      Real_t x0 = domain.x(n0) ;
-      Real_t x1 = domain.x(n1) ;
-      Real_t x2 = domain.x(n2) ;
-      Real_t x3 = domain.x(n3) ;
-      Real_t x4 = domain.x(n4) ;
-      Real_t x5 = domain.x(n5) ;
-      Real_t x6 = domain.x(n6) ;
-      Real_t x7 = domain.x(n7) ;
-
-      Real_t y0 = domain.y(n0) ;
-      Real_t y1 = domain.y(n1) ;
-      Real_t y2 = domain.y(n2) ;
-      Real_t y3 = domain.y(n3) ;
-      Real_t y4 = domain.y(n4) ;
-      Real_t y5 = domain.y(n5) ;
-      Real_t y6 = domain.y(n6) ;
-      Real_t y7 = domain.y(n7) ;
-
-      Real_t z0 = domain.z(n0) ;
-      Real_t z1 = domain.z(n1) ;
-      Real_t z2 = domain.z(n2) ;
-      Real_t z3 = domain.z(n3) ;
-      Real_t z4 = domain.z(n4) ;
-      Real_t z5 = domain.z(n5) ;
-      Real_t z6 = domain.z(n6) ;
-      Real_t z7 = domain.z(n7) ;
-
-      Real_t xv0 = domain.xd(n0) ;
-      Real_t xv1 = domain.xd(n1) ;
-      Real_t xv2 = domain.xd(n2) ;
-      Real_t xv3 = domain.xd(n3) ;
-      Real_t xv4 = domain.xd(n4) ;
-      Real_t xv5 = domain.xd(n5) ;
-      Real_t xv6 = domain.xd(n6) ;
-      Real_t xv7 = domain.xd(n7) ;
-
-      Real_t yv0 = domain.yd(n0) ;
-      Real_t yv1 = domain.yd(n1) ;
-      Real_t yv2 = domain.yd(n2) ;
-      Real_t yv3 = domain.yd(n3) ;
-      Real_t yv4 = domain.yd(n4) ;
-      Real_t yv5 = domain.yd(n5) ;
-      Real_t yv6 = domain.yd(n6) ;
-      Real_t yv7 = domain.yd(n7) ;
-
-      Real_t zv0 = domain.zd(n0) ;
-      Real_t zv1 = domain.zd(n1) ;
-      Real_t zv2 = domain.zd(n2) ;
-      Real_t zv3 = domain.zd(n3) ;
-      Real_t zv4 = domain.zd(n4) ;
-      Real_t zv5 = domain.zd(n5) ;
-      Real_t zv6 = domain.zd(n6) ;
-      Real_t zv7 = domain.zd(n7) ;
-
-      Real_t vol = domain.volo(i)*domain.vnew(i) ;
-      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;
-
-      Real_t dxj = Real_t(-0.25)*((x0+x1+x5+x4) - (x3+x2+x6+x7)) ;
-      Real_t dyj = Real_t(-0.25)*((y0+y1+y5+y4) - (y3+y2+y6+y7)) ;
-      Real_t dzj = Real_t(-0.25)*((z0+z1+z5+z4) - (z3+z2+z6+z7)) ;
-
-      Real_t dxi = Real_t( 0.25)*((x1+x2+x6+x5) - (x0+x3+x7+x4)) ;
-      Real_t dyi = Real_t( 0.25)*((y1+y2+y6+y5) - (y0+y3+y7+y4)) ;
-      Real_t dzi = Real_t( 0.25)*((z1+z2+z6+z5) - (z0+z3+z7+z4)) ;
-
-      Real_t dxk = Real_t( 0.25)*((x4+x5+x6+x7) - (x0+x1+x2+x3)) ;
-      Real_t dyk = Real_t( 0.25)*((y4+y5+y6+y7) - (y0+y1+y2+y3)) ;
-      Real_t dzk = Real_t( 0.25)*((z4+z5+z6+z7) - (z0+z1+z2+z3)) ;
-
-      /* find delvk and delxk ( i cross j ) */
-
-      ax = dyi*dzj - dzi*dyj ;
-      ay = dzi*dxj - dxi*dzj ;
-      az = dxi*dyj - dyi*dxj ;
-
-      domain.delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*((xv4+xv5+xv6+xv7) - (xv0+xv1+xv2+xv3)) ;
-      dyv = Real_t(0.25)*((yv4+yv5+yv6+yv7) - (yv0+yv1+yv2+yv3)) ;
-      dzv = Real_t(0.25)*((zv4+zv5+zv6+zv7) - (zv0+zv1+zv2+zv3)) ;
-
-      domain.delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxi and delvi ( j cross k ) */
-
-      ax = dyj*dzk - dzj*dyk ;
-      ay = dzj*dxk - dxj*dzk ;
-      az = dxj*dyk - dyj*dxk ;
-
-      domain.delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(0.25)*((xv1+xv2+xv6+xv5) - (xv0+xv3+xv7+xv4)) ;
-      dyv = Real_t(0.25)*((yv1+yv2+yv6+yv5) - (yv0+yv3+yv7+yv4)) ;
-      dzv = Real_t(0.25)*((zv1+zv2+zv6+zv5) - (zv0+zv3+zv7+zv4)) ;
-
-      domain.delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;
-
-      /* find delxj and delvj ( k cross i ) */
-
-      ax = dyk*dzi - dzk*dyi ;
-      ay = dzk*dxi - dxk*dzi ;
-      az = dxk*dyi - dyk*dxi ;
-
-      domain.delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;
-
-      ax *= norm ;
-      ay *= norm ;
-      az *= norm ;
-
-      dxv = Real_t(-0.25)*((xv0+xv1+xv5+xv4) - (xv3+xv2+xv6+xv7)) ;
-      dyv = Real_t(-0.25)*((yv0+yv1+yv5+yv4) - (yv3+yv2+yv6+yv7)) ;
-      dzv = Real_t(-0.25)*((zv0+zv1+zv5+zv4) - (zv3+zv2+zv6+zv7)) ;
-
-      domain.delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcMonotonicQRegionForElems(Domain &domain, Int_t r,
-                                  Real_t ptiny)
-{
-   Real_t monoq_limiter_mult = domain.monoq_limiter_mult();
-   Real_t monoq_max_slope = domain.monoq_max_slope();
-   Real_t qlc_monoq = domain.qlc_monoq();
-   Real_t qqc_monoq = domain.qqc_monoq();
-
-#pragma omp parallel for firstprivate(qlc_monoq, qqc_monoq, monoq_limiter_mult, monoq_max_slope, ptiny)
-   for ( Index_t i = 0 ; i < domain.regElemSize(r); ++i ) {
-      Index_t ielem = domain.regElemlist(r,i);
-      Real_t qlin, qquad ;
-      Real_t phixi, phieta, phizeta ;
-      Int_t bcMask = domain.elemBC(ielem) ;
-      Real_t delvm = 0.0, delvp =0.0;
-
-      /*  phixi     */
-      Real_t norm = Real_t(1.) / (domain.delv_xi(ielem)+ ptiny ) ;
-
-      switch (bcMask & XI_M) {
-         case XI_M_COMM: /* needs comm data */
-         case 0:         delvm = domain.delv_xi(domain.lxim(ielem)); break ;
-         case XI_M_SYMM: delvm = domain.delv_xi(ielem) ;       break ;
-         case XI_M_FREE: delvm = Real_t(0.0) ;      break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & XI_P) {
-         case XI_P_COMM: /* needs comm data */
-         case 0:         delvp = domain.delv_xi(domain.lxip(ielem)) ; break ;
-         case XI_P_SYMM: delvp = domain.delv_xi(ielem) ;       break ;
-         case XI_P_FREE: delvp = Real_t(0.0) ;      break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phixi = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm < phixi ) phixi = delvm ;
-      if ( delvp < phixi ) phixi = delvp ;
-      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;
-      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;
-
-
-      /*  phieta     */
-      norm = Real_t(1.) / ( domain.delv_eta(ielem) + ptiny ) ;
-
-      switch (bcMask & ETA_M) {
-         case ETA_M_COMM: /* needs comm data */
-         case 0:          delvm = domain.delv_eta(domain.letam(ielem)) ; break ;
-         case ETA_M_SYMM: delvm = domain.delv_eta(ielem) ;        break ;
-         case ETA_M_FREE: delvm = Real_t(0.0) ;        break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & ETA_P) {
-         case ETA_P_COMM: /* needs comm data */
-         case 0:          delvp = domain.delv_eta(domain.letap(ielem)) ; break ;
-         case ETA_P_SYMM: delvp = domain.delv_eta(ielem) ;        break ;
-         case ETA_P_FREE: delvp = Real_t(0.0) ;        break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phieta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm  < phieta ) phieta = delvm ;
-      if ( delvp  < phieta ) phieta = delvp ;
-      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;
-      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;
-
-      /*  phizeta     */
-      norm = Real_t(1.) / ( domain.delv_zeta(ielem) + ptiny ) ;
-
-      switch (bcMask & ZETA_M) {
-         case ZETA_M_COMM: /* needs comm data */
-         case 0:           delvm = domain.delv_zeta(domain.lzetam(ielem)) ; break ;
-         case ZETA_M_SYMM: delvm = domain.delv_zeta(ielem) ;         break ;
-         case ZETA_M_FREE: delvm = Real_t(0.0) ;          break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvm = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-      switch (bcMask & ZETA_P) {
-         case ZETA_P_COMM: /* needs comm data */
-         case 0:           delvp = domain.delv_zeta(domain.lzetap(ielem)) ; break ;
-         case ZETA_P_SYMM: delvp = domain.delv_zeta(ielem) ;         break ;
-         case ZETA_P_FREE: delvp = Real_t(0.0) ;          break ;
-         default:          fprintf(stderr, "Error in switch at %s line %d\n",
-                                   __FILE__, __LINE__);
-            delvp = 0; /* ERROR - but quiets the compiler */
-            break;
-      }
-
-      delvm = delvm * norm ;
-      delvp = delvp * norm ;
-
-      phizeta = Real_t(.5) * ( delvm + delvp ) ;
-
-      delvm *= monoq_limiter_mult ;
-      delvp *= monoq_limiter_mult ;
-
-      if ( delvm   < phizeta ) phizeta = delvm ;
-      if ( delvp   < phizeta ) phizeta = delvp ;
-      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);
-      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;
-
-      /* Remove length scale */
-
-      if ( domain.vdov(ielem) > Real_t(0.) )  {
-         qlin  = Real_t(0.) ;
-         qquad = Real_t(0.) ;
-      }
-      else {
-         Real_t delvxxi   = domain.delv_xi(ielem)   * domain.delx_xi(ielem)   ;
-         Real_t delvxeta  = domain.delv_eta(ielem)  * domain.delx_eta(ielem)  ;
-         Real_t delvxzeta = domain.delv_zeta(ielem) * domain.delx_zeta(ielem) ;
-
-         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;
-         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;
-         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;
-
-         Real_t rho = domain.elemMass(ielem) / (domain.volo(ielem) * domain.vnew(ielem)) ;
-
-         qlin = -qlc_monoq * rho *
-            (  delvxxi   * (Real_t(1.) - phixi) +
-               delvxeta  * (Real_t(1.) - phieta) +
-               delvxzeta * (Real_t(1.) - phizeta)  ) ;
-
-         qquad = qqc_monoq * rho *
-            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +
-               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +
-               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;
-      }
-
-      domain.qq(ielem) = qquad ;
-      domain.ql(ielem) = qlin  ;
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcMonotonicQForElems(Domain& domain)
-{  
-   //
-   // initialize parameters
-   // 
-   const Real_t ptiny = Real_t(1.e-36) ;
-
-   //
-   // calculate the monotonic q for all regions
-   //
-   for (Index_t r=0 ; r<domain.numReg() ; ++r) {
-      if (domain.regElemSize(r) > 0) {
-         CalcMonotonicQRegionForElems(domain, r, ptiny) ;
-      }
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcQForElems(Domain& domain)
-{
-   //
-   // MONOTONIC Q option
-   //
-
-   Index_t numElem = domain.numElem() ;
-
-   if (numElem != 0) {
-      Int_t allElem = numElem +  /* local elem */
-            2*domain.sizeX()*domain.sizeY() + /* plane ghosts */
-            2*domain.sizeX()*domain.sizeZ() + /* row ghosts */
-            2*domain.sizeY()*domain.sizeZ() ; /* col ghosts */
-
-      domain.AllocateGradients(numElem, allElem);
-
-#if USE_MPI      
-      CommRecv(domain, MSG_MONOQ, 3,
-               domain.sizeX(), domain.sizeY(), domain.sizeZ(),
-               true, true) ;
-#endif      
-
-      /* Calculate velocity gradients */
-      CalcMonotonicQGradientsForElems(domain);
-
-#if USE_MPI      
-      Domain_member fieldData[3] ;
-      
-      /* Transfer veloctiy gradients in the first order elements */
-      /* problem->commElements->Transfer(CommElements::monoQ) ; */
-
-      fieldData[0] = &Domain::delv_xi ;
-      fieldData[1] = &Domain::delv_eta ;
-      fieldData[2] = &Domain::delv_zeta ;
-
-      CommSend(domain, MSG_MONOQ, 3, fieldData,
-               domain.sizeX(), domain.sizeY(), domain.sizeZ(),
-               true, true) ;
-
-      CommMonoQ(domain) ;
-#endif      
-
-      CalcMonotonicQForElems(domain);
-
-      // Free up memory
-      domain.DeallocateGradients();
-
-      /* Don't allow excessive artificial viscosity */
-      Index_t idx = -1; 
-      for (Index_t i=0; i<numElem; ++i) {
-         if ( domain.q(i) > domain.qstop() ) {
-            idx = i ;
-            break ;
-         }
-      }
-
-      if(idx >= 0) {
-#if USE_MPI         
-         MPI_Abort(MPI_COMM_WORLD, QStopError) ;
-#else
-         exit(QStopError);
-#endif
-      }
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcPressureForElems(Real_t* p_new, Real_t* bvc,
-                          Real_t* pbvc, Real_t* e_old,
-                          Real_t* compression, Real_t *vnewc,
-                          Real_t pmin,
-                          Real_t p_cut, Real_t eosvmax,
-                          Index_t length, Index_t *regElemList)
-{
-#pragma omp parallel for firstprivate(length)
-   for (Index_t i = 0; i < length ; ++i) {
-      Real_t c1s = Real_t(2.0)/Real_t(3.0) ;
-      bvc[i] = c1s * (compression[i] + Real_t(1.));
-      pbvc[i] = c1s;
-   }
-
-#pragma omp parallel for firstprivate(length, pmin, p_cut, eosvmax)
-   for (Index_t i = 0 ; i < length ; ++i){
-      Index_t ielem = regElemList[i];
-      
-      p_new[i] = bvc[i] * e_old[i] ;
-
-      if    (FABS(p_new[i]) <  p_cut   )
-         p_new[i] = Real_t(0.0) ;
-
-      if    ( vnewc[ielem] >= eosvmax ) /* impossible condition here? */
-         p_new[i] = Real_t(0.0) ;
-
-      if    (p_new[i]       <  pmin)
-         p_new[i]   = pmin ;
-   }
-}
-
-/******************************************/
-
-static inline
-void CalcEnergyForElems(Real_t* p_new, Real_t* e_new, Real_t* q_new,
-                        Real_t* bvc, Real_t* pbvc,
-                        Real_t* p_old, Real_t* e_old, Real_t* q_old,
-                        Real_t* compression, Real_t* compHalfStep,
-                        Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,
-                        Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,
-                        Real_t* qq_old, Real_t* ql_old,
-                        Real_t rho0,
-                        Real_t eosvmax,
-                        Index_t length, Index_t *regElemList)
-{
-   Real_t *pHalfStep = Allocate<Real_t>(length) ;
-
-#pragma omp parallel for firstprivate(length, emin)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])
-         + Real_t(0.5) * work[i];
-
-      if (e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   }
-
-   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,
-                        pmin, p_cut, eosvmax, length, regElemList);
-
-#pragma omp parallel for firstprivate(length, rho0)
-   for (Index_t i = 0 ; i < length ; ++i) {
-      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;
-
-      if ( delvc[i] > Real_t(0.) ) {
-         q_new[i] /* = qq_old[i] = ql_old[i] */ = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]
-         * (  Real_t(3.0)*(p_old[i]     + q_old[i])
-              - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;
-   }
-
-#pragma omp parallel for firstprivate(length, emin, e_cut)
-   for (Index_t i = 0 ; i < length ; ++i) {
-
-      e_new[i] += Real_t(0.5) * work[i];
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   }
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                        pmin, p_cut, eosvmax, length, regElemList);
-
-#pragma omp parallel for firstprivate(length, rho0, emin, e_cut)
-   for (Index_t i = 0 ; i < length ; ++i){
-      const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;
-      Index_t ielem = regElemList[i];
-      Real_t q_tilde ;
-
-      if (delvc[i] > Real_t(0.)) {
-         q_tilde = Real_t(0.) ;
-      }
-      else {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[ielem] * vnewc[ielem] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_tilde = (ssc*ql_old[i] + qq_old[i]) ;
-      }
-
-      e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])
-                               - Real_t(8.0)*(pHalfStep[i] + q_new[i])
-                               + (p_new[i] + q_tilde)) * delvc[i]*sixth ;
-
-      if (FABS(e_new[i]) < e_cut) {
-         e_new[i] = Real_t(0.)  ;
-      }
-      if (     e_new[i]  < emin ) {
-         e_new[i] = emin ;
-      }
-   }
-
-   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,
-                        pmin, p_cut, eosvmax, length, regElemList);
-
-#pragma omp parallel for firstprivate(length, rho0, q_cut)
-   for (Index_t i = 0 ; i < length ; ++i){
-      Index_t ielem = regElemList[i];
-
-      if ( delvc[i] <= Real_t(0.) ) {
-         Real_t ssc = ( pbvc[i] * e_new[i]
-                 + vnewc[ielem] * vnewc[ielem] * bvc[i] * p_new[i] ) / rho0 ;
-
-         if ( ssc <= Real_t(.1111111e-36) ) {
-            ssc = Real_t(.3333333e-18) ;
-         } else {
-            ssc = SQRT(ssc) ;
-         }
-
-         q_new[i] = (ssc*ql_old[i] + qq_old[i]) ;
-
-         if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;
-      }
-   }
-
-   Release(&pHalfStep) ;
-
-   return ;
-}
-
-/******************************************/
-
-static inline
-void CalcSoundSpeedForElems(Domain &domain,
-                            Real_t *vnewc, Real_t rho0, Real_t *enewc,
-                            Real_t *pnewc, Real_t *pbvc,
-                            Real_t *bvc, Real_t ss4o3,
-                            Index_t len, Index_t *regElemList)
-{
-#pragma omp parallel for firstprivate(rho0, ss4o3)
-   for (Index_t i = 0; i < len ; ++i) {
-      Index_t ielem = regElemList[i];
-      Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[ielem] * vnewc[ielem] *
-                 bvc[i] * pnewc[i]) / rho0;
-      if (ssTmp <= Real_t(.1111111e-36)) {
-         ssTmp = Real_t(.3333333e-18);
-      }
-      else {
-         ssTmp = SQRT(ssTmp);
-      }
-      domain.ss(ielem) = ssTmp ;
-   }
-}
-
-/******************************************/
-
-static inline
-void EvalEOSForElems(Domain& domain, Real_t *vnewc,
-                     Int_t numElemReg, Index_t *regElemList, Int_t rep)
-{
-   Real_t  e_cut = domain.e_cut() ;
-   Real_t  p_cut = domain.p_cut() ;
-   Real_t  ss4o3 = domain.ss4o3() ;
-   Real_t  q_cut = domain.q_cut() ;
-
-   Real_t eosvmax = domain.eosvmax() ;
-   Real_t eosvmin = domain.eosvmin() ;
-   Real_t pmin    = domain.pmin() ;
-   Real_t emin    = domain.emin() ;
-   Real_t rho0    = domain.refdens() ;
-
-   // These temporaries will be of different size for 
-   // each call (due to different sized region element
-   // lists)
-   Real_t *e_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *delvc = Allocate<Real_t>(numElemReg) ;
-   Real_t *p_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *q_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *compression = Allocate<Real_t>(numElemReg) ;
-   Real_t *compHalfStep = Allocate<Real_t>(numElemReg) ;
-   Real_t *qq_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *ql_old = Allocate<Real_t>(numElemReg) ;
-   Real_t *work = Allocate<Real_t>(numElemReg) ;
-   Real_t *p_new = Allocate<Real_t>(numElemReg) ;
-   Real_t *e_new = Allocate<Real_t>(numElemReg) ;
-   Real_t *q_new = Allocate<Real_t>(numElemReg) ;
-   Real_t *bvc = Allocate<Real_t>(numElemReg) ;
-   Real_t *pbvc = Allocate<Real_t>(numElemReg) ;
- 
-   //loop to add load imbalance based on region number 
-   for(Int_t j = 0; j < rep; j++) {
-      /* compress data, minimal set */
-#pragma omp parallel
-      {
-#pragma omp for nowait firstprivate(numElemReg)
-         for (Index_t i=0; i<numElemReg; ++i) {
-            Index_t ielem = regElemList[i];
-            e_old[i] = domain.e(ielem) ;
-            delvc[i] = domain.delv(ielem) ;
-            p_old[i] = domain.p(ielem) ;
-            q_old[i] = domain.q(ielem) ;
-            qq_old[i] = domain.qq(ielem) ;
-            ql_old[i] = domain.ql(ielem) ;
-         }
-
-#pragma omp for firstprivate(numElemReg)
-         for (Index_t i = 0; i < numElemReg ; ++i) {
-            Index_t ielem = regElemList[i];
-            Real_t vchalf ;
-            compression[i] = Real_t(1.) / vnewc[ielem] - Real_t(1.);
-            vchalf = vnewc[ielem] - delvc[i] * Real_t(.5);
-            compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);
-         }
-
-      /* Check for v > eosvmax or v < eosvmin */
-         if ( eosvmin != Real_t(0.) ) {
-#pragma omp for nowait firstprivate(numElemReg, eosvmin)
-            for(Index_t i=0 ; i<numElemReg ; ++i) {
-               Index_t ielem = regElemList[i];
-               if (vnewc[ielem] <= eosvmin) { /* impossible due to calling func? */
-                  compHalfStep[i] = compression[i] ;
-               }
-            }
-         }
-         if ( eosvmax != Real_t(0.) ) {
-#pragma omp for nowait firstprivate(numElemReg, eosvmax)
-            for(Index_t i=0 ; i<numElemReg ; ++i) {
-               Index_t ielem = regElemList[i];
-               if (vnewc[ielem] >= eosvmax) { /* impossible due to calling func? */
-                  p_old[i]        = Real_t(0.) ;
-                  compression[i]  = Real_t(0.) ;
-                  compHalfStep[i] = Real_t(0.) ;
-               }
-            }
-         }
-
-#pragma omp for nowait firstprivate(numElemReg)
-         for (Index_t i = 0 ; i < numElemReg ; ++i) {
-            work[i] = Real_t(0.) ; 
-         }
-      }
-      CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,
-                         p_old, e_old,  q_old, compression, compHalfStep,
-                         vnewc, work,  delvc, pmin,
-                         p_cut, e_cut, q_cut, emin,
-                         qq_old, ql_old, rho0, eosvmax,
-                         numElemReg, regElemList);
-   }
-
-#pragma omp parallel for firstprivate(numElemReg)
-   for (Index_t i=0; i<numElemReg; ++i) {
-      Index_t ielem = regElemList[i];
-      domain.p(ielem) = p_new[i] ;
-      domain.e(ielem) = e_new[i] ;
-      domain.q(ielem) = q_new[i] ;
-   }
-
-   CalcSoundSpeedForElems(domain,
-                          vnewc, rho0, e_new, p_new,
-                          pbvc, bvc, ss4o3,
-                          numElemReg, regElemList) ;
-
-   Release(&pbvc) ;
-   Release(&bvc) ;
-   Release(&q_new) ;
-   Release(&e_new) ;
-   Release(&p_new) ;
-   Release(&work) ;
-   Release(&ql_old) ;
-   Release(&qq_old) ;
-   Release(&compHalfStep) ;
-   Release(&compression) ;
-   Release(&q_old) ;
-   Release(&p_old) ;
-   Release(&delvc) ;
-   Release(&e_old) ;
-}
-
-/******************************************/
-
-static inline
-void ApplyMaterialPropertiesForElems(Domain& domain)
-{
-   Index_t numElem = domain.numElem() ;
-
-  if (numElem != 0) {
-    /* Expose all of the variables needed for material evaluation */
-    Real_t eosvmin = domain.eosvmin() ;
-    Real_t eosvmax = domain.eosvmax() ;
-    Real_t *vnewc = Allocate<Real_t>(numElem) ;
-
-#pragma omp parallel
-    {
-#pragma omp for firstprivate(numElem)
-       for(Index_t i=0 ; i<numElem ; ++i) {
-          vnewc[i] = domain.vnew(i) ;
-       }
-
-       // Bound the updated relative volumes with eosvmin/max
-       if (eosvmin != Real_t(0.)) {
-#pragma omp for nowait firstprivate(numElem)
-          for(Index_t i=0 ; i<numElem ; ++i) {
-             if (vnewc[i] < eosvmin)
-                vnewc[i] = eosvmin ;
-          }
-       }
-
-       if (eosvmax != Real_t(0.)) {
-#pragma omp for nowait firstprivate(numElem)
-          for(Index_t i=0 ; i<numElem ; ++i) {
-             if (vnewc[i] > eosvmax)
-                vnewc[i] = eosvmax ;
-          }
-       }
-
-       // This check may not make perfect sense in LULESH, but
-       // it's representative of something in the full code -
-       // just leave it in, please
-#pragma omp for nowait firstprivate(numElem)
-       for (Index_t i=0; i<numElem; ++i) {
-          Real_t vc = domain.v(i) ;
-          if (eosvmin != Real_t(0.)) {
-             if (vc < eosvmin)
-                vc = eosvmin ;
-          }
-          if (eosvmax != Real_t(0.)) {
-             if (vc > eosvmax)
-                vc = eosvmax ;
-          }
-          if (vc <= 0.) {
-#if USE_MPI
-             MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
-#else
-             exit(VolumeError);
-#endif
-          }
-       }
-    }
-
-    for (Int_t r=0 ; r<domain.numReg() ; r++) {
-       Index_t numElemReg = domain.regElemSize(r);
-       Index_t *regElemList = domain.regElemlist(r);
-       Int_t rep;
-       //Determine load imbalance for this region
-       //round down the number with lowest cost
-       if(r < domain.numReg()/2)
-	 rep = 1;
-       //you don't get an expensive region unless you at least have 5 regions
-       else if(r < (domain.numReg() - (domain.numReg()+15)/20))
-         rep = 1 + domain.cost();
-       //very expensive regions
-       else
-	 rep = 10 * (1+ domain.cost());
-       EvalEOSForElems(domain, vnewc, numElemReg, regElemList, rep);
-    }
-
-    Release(&vnewc) ;
-  }
-}
-
-/******************************************/
-
-static inline
-void UpdateVolumesForElems(Domain &domain,
-                           Real_t v_cut, Index_t length)
-{
-   if (length != 0) {
-#pragma omp parallel for firstprivate(length, v_cut)
-      for(Index_t i=0 ; i<length ; ++i) {
-         Real_t tmpV = domain.vnew(i) ;
-
-         if ( FABS(tmpV - Real_t(1.0)) < v_cut )
-            tmpV = Real_t(1.0) ;
-
-         domain.v(i) = tmpV ;
-      }
-   }
-
-   return ;
-}
-
-/******************************************/
-
-static inline
-void LagrangeElements(Domain& domain, Index_t numElem)
-{
-  CalcLagrangeElements(domain) ;
-
-  /* Calculate Q.  (Monotonic q option requires communication) */
-  CalcQForElems(domain) ;
-
-  ApplyMaterialPropertiesForElems(domain) ;
-
-  UpdateVolumesForElems(domain, 
-                        domain.v_cut(), numElem) ;
-}
-
-/******************************************/
-
-static inline
-void CalcCourantConstraintForElems(Domain &domain, Index_t length,
-                                   Index_t *regElemlist,
-                                   Real_t qqc, Real_t& dtcourant)
-{
-#if 0
-
-#if USE_OMP
-   const Index_t threads = omp_get_max_threads();
-   Index_t courant_elem_per_thread[threads];
-   Real_t dtcourant_per_thread[threads];
-#else
-   Index_t threads = 1;
-   Index_t courant_elem_per_thread[1];
-   Real_t  dtcourant_per_thread[1];
-#endif
-
-#else
-
-#if USE_OMP
-   const Index_t threads = omp_get_max_threads();
-#else
-   Index_t threads = 1;
-#endif
-   std::vector<Index_t> courant_elem_per_thread(threads);
-   std::vector<Real_t> dtcourant_per_thread(threads);
-
-#endif
-
-#pragma omp parallel firstprivate(length, qqc)
-   {
-      Real_t   qqc2 = Real_t(64.0) * qqc * qqc ;
-      Real_t   dtcourant_tmp = dtcourant;
-      Index_t  courant_elem  = -1 ;
-
-#if USE_OMP
-      Index_t thread_num = omp_get_thread_num();
-#else
-      Index_t thread_num = 0;
-#endif      
-
-#pragma omp for 
-      for (Index_t i = 0 ; i < length ; ++i) {
-         Index_t indx = regElemlist[i] ;
-         Real_t dtf = domain.ss(indx) * domain.ss(indx) ;
-
-         if ( domain.vdov(indx) < Real_t(0.) ) {
-            dtf = dtf
-                + qqc2 * domain.arealg(indx) * domain.arealg(indx)
-                * domain.vdov(indx) * domain.vdov(indx) ;
-         }
-
-         dtf = SQRT(dtf) ;
-         dtf = domain.arealg(indx) / dtf ;
-
-         if (domain.vdov(indx) != Real_t(0.)) {
-            if ( dtf < dtcourant_tmp ) {
-               dtcourant_tmp = dtf ;
-               courant_elem  = indx ;
-            }
-         }
-      }
-
-      dtcourant_per_thread[thread_num]    = dtcourant_tmp ;
-      courant_elem_per_thread[thread_num] = courant_elem ;
-   }
-
-   for (Index_t i = 1; i < threads; ++i) {
-      if (dtcourant_per_thread[i] < dtcourant_per_thread[0] ) {
-         dtcourant_per_thread[0]    = dtcourant_per_thread[i];
-         courant_elem_per_thread[0] = courant_elem_per_thread[i];
-      }
-   }
-
-   if (courant_elem_per_thread[0] != -1) {
-      dtcourant = dtcourant_per_thread[0] ;
-   }
-
-   return ;
-
-}
-
-/******************************************/
-
-static inline
-void CalcHydroConstraintForElems(Domain &domain, Index_t length,
-                                 Index_t *regElemlist, Real_t dvovmax, Real_t& dthydro)
-{
-#if 0
-
-#if USE_OMP
-   const Index_t threads = omp_get_max_threads();
-   Index_t hydro_elem_per_thread[threads];
-   Real_t dthydro_per_thread[threads];
-#else
-   Index_t threads = 1;
-   Index_t hydro_elem_per_thread[1];
-   Real_t  dthydro_per_thread[1];
-#endif
-
-#else
-
-#if USE_OMP
-   const Index_t threads = omp_get_max_threads();
-#else
-   Index_t threads = 1;
-#endif
-   std::vector<Index_t> hydro_elem_per_thread(threads);
-   std::vector<Real_t> dthydro_per_thread(threads);
-
-#endif
-
-#pragma omp parallel firstprivate(length, dvovmax)
-   {
-      Real_t dthydro_tmp = dthydro ;
-      Index_t hydro_elem = -1 ;
-
-#if USE_OMP      
-      Index_t thread_num = omp_get_thread_num();
-#else      
-      Index_t thread_num = 0;
-#endif      
-
-#pragma omp for
-      for (Index_t i = 0 ; i < length ; ++i) {
-         Index_t indx = regElemlist[i] ;
-
-         if (domain.vdov(indx) != Real_t(0.)) {
-            Real_t dtdvov = dvovmax / (FABS(domain.vdov(indx))+Real_t(1.e-20)) ;
-
-            if ( dthydro_tmp > dtdvov ) {
-                  dthydro_tmp = dtdvov ;
-                  hydro_elem = indx ;
-            }
-         }
-      }
-
-      dthydro_per_thread[thread_num]    = dthydro_tmp ;
-      hydro_elem_per_thread[thread_num] = hydro_elem ;
-   }
-
-   for (Index_t i = 1; i < threads; ++i) {
-      if(dthydro_per_thread[i] < dthydro_per_thread[0]) {
-         dthydro_per_thread[0]    = dthydro_per_thread[i];
-         hydro_elem_per_thread[0] =  hydro_elem_per_thread[i];
-      }
-   }
-
-   if (hydro_elem_per_thread[0] != -1) {
-      dthydro =  dthydro_per_thread[0] ;
-   }
-
-   return ;
-}
-
-/******************************************/
-
-static inline
-void CalcTimeConstraintsForElems(Domain& domain) {
-
-   // Initialize conditions to a very large value
-   domain.dtcourant() = 1.0e+20;
-   domain.dthydro() = 1.0e+20;
-
-   for (Index_t r=0 ; r < domain.numReg() ; ++r) {
-      /* evaluate time constraint */
-      CalcCourantConstraintForElems(domain, domain.regElemSize(r),
-                                    domain.regElemlist(r),
-                                    domain.qqc(),
-                                    domain.dtcourant()) ;
-
-      /* check hydro constraint */
-      CalcHydroConstraintForElems(domain, domain.regElemSize(r),
-                                  domain.regElemlist(r),
-                                  domain.dvovmax(),
-                                  domain.dthydro()) ;
-   }
-}
-
-/******************************************/
-
-static inline
-void LagrangeLeapFrog(Domain& domain)
-{
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   Domain_member fieldData[6] ;
-#endif
-
-   /* calculate nodal forces, accelerations, velocities, positions, with
-    * applied boundary conditions and slide surface considerations */
-   LagrangeNodal(domain);
-
-
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-#endif
-
-   /* calculate element quantities (i.e. velocity gradient & q), and update
-    * material states */
-   LagrangeElements(domain, domain.numElem());
-
-#if USE_MPI   
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   CommRecv(domain, MSG_SYNC_POS_VEL, 6,
-            domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1,
-            false, false) ;
-
-   fieldData[0] = &Domain::x ;
-   fieldData[1] = &Domain::y ;
-   fieldData[2] = &Domain::z ;
-   fieldData[3] = &Domain::xd ;
-   fieldData[4] = &Domain::yd ;
-   fieldData[5] = &Domain::zd ;
-   
-   CommSend(domain, MSG_SYNC_POS_VEL, 6, fieldData,
-            domain.sizeX() + 1, domain.sizeY() + 1, domain.sizeZ() + 1,
-            false, false) ;
-#endif
-#endif   
-
-   CalcTimeConstraintsForElems(domain);
-
-#if USE_MPI   
-#if defined(SEDOV_SYNC_POS_VEL_LATE)
-   CommSyncPosVel(domain) ;
-#endif
-#endif   
-}
-
-
-/******************************************/
-
-int main(int argc, char *argv[])
-{
-  Domain *locDom ;
-   Int_t numRanks ;
-   Int_t myRank ;
-   struct cmdLineOpts opts;
-
-#if USE_MPI   
-   Domain_member fieldData ;
-
-   MPI_Init(&argc, &argv) ;
-   MPI_Comm_size(MPI_COMM_WORLD, &numRanks) ;
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
-#else
-   numRanks = 1;
-   myRank = 0;
-#endif   
-
-   /* Set defaults that can be overridden by command line opts */
-   opts.its = 9999999;
-   opts.nx  = 30;
-   opts.numReg = 11;
-   opts.numFiles = (int)(numRanks+10)/9;
-   opts.showProg = 0;
-   opts.quiet = 0;
-   opts.viz = 0;
-   opts.balance = 1;
-   opts.cost = 1;
-
-   ParseCommandLineOptions(argc, argv, myRank, &opts);
-
-   if ((myRank == 0) && (opts.quiet == 0)) {
-      printf("Running problem size %d^3 per domain until completion\n", opts.nx);
-      printf("Num processors: %d\n", numRanks);
-#if USE_OMP
-      printf("Num threads: %d\n", omp_get_max_threads());
-#endif
-      printf("Total number of elements: %lld\n\n", (long long int)(numRanks*opts.nx*opts.nx*opts.nx));
-      printf("To run other sizes, use -s <integer>.\n");
-      printf("To run a fixed number of iterations, use -i <integer>.\n");
-      printf("To run a more or less balanced region set, use -b <integer>.\n");
-      printf("To change the relative costs of regions, use -c <integer>.\n");
-      printf("To print out progress, use -p\n");
-      printf("To write an output file for VisIt, use -v\n");
-      printf("See help (-h) for more options\n\n");
-   }
-
-   // Set up the mesh and decompose. Assumes regular cubes for now
-   Int_t col, row, plane, side;
-   InitMeshDecomp(numRanks, myRank, &col, &row, &plane, &side);
-
-   // Build the main data structure and initialize it
-   locDom = new Domain(numRanks, col, row, plane, opts.nx,
-                       side, opts.numReg, opts.balance, opts.cost) ;
-
-
-#if USE_MPI   
-   fieldData = &Domain::nodalMass ;
-
-   // Initial domain boundary communication 
-   CommRecv(*locDom, MSG_COMM_SBN, 1,
-            locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() + 1,
-            true, false) ;
-   CommSend(*locDom, MSG_COMM_SBN, 1, &fieldData,
-            locDom->sizeX() + 1, locDom->sizeY() + 1, locDom->sizeZ() +  1,
-            true, false) ;
-   CommSBN(*locDom, 1, &fieldData) ;
-
-   // End initialization
-   MPI_Barrier(MPI_COMM_WORLD);
-#endif   
-   
-   // BEGIN timestep to solution */
-#ifdef RAJA_USE_CALIPER
-   RAJA::Timer timer_main; 
-   timer_main.start("timer_main");
-#else
-#if USE_MPI   
-   double start = MPI_Wtime();
-#else
-   timeval start;
-   gettimeofday(&start, NULL) ;
-#endif
-#endif
-//debug to see region sizes
-//   for(Int_t i = 0; i < locDom->numReg(); i++)
-//      std::cout << "region" << i + 1<< "size" << locDom->regElemSize(i) <<std::endl;
-   while((locDom->time() < locDom->stoptime()) && (locDom->cycle() < opts.its)) {
-
-      TimeIncrement(*locDom) ;
-      LagrangeLeapFrog(*locDom) ;
-
-      if ((opts.showProg != 0) && (opts.quiet == 0) && (myRank == 0)) {
-         printf("cycle = %d, time = %e, dt=%e\n",
-                locDom->cycle(), double(locDom->time()), double(locDom->deltatime()) ) ;
-      }
-   }
-
-   // Use reduced max elapsed time
-   double elapsed_time;
-#ifdef RAJA_USE_CALIPER
-   // Use reduced max elapsed time
-   timer_main.stop("timer_main");
-   elapsed_time = timer_main.elapsed();
-#else
-#if USE_MPI   
-   elapsed_time = MPI_Wtime() - start;
-#else
-   timeval end;
-   gettimeofday(&end, NULL) ;
-   elapsed_time = (double)(end.tv_sec - start.tv_sec) + ((double)(end.tv_usec - start.tv_usec))/1000000 ;
-#endif
-#endif
-   double elapsed_timeG;
-#if USE_MPI   
-   MPI_Reduce(&elapsed_time, &elapsed_timeG, 1, MPI_DOUBLE,
-              MPI_MAX, 0, MPI_COMM_WORLD);
-#else
-   elapsed_timeG = elapsed_time;
-#endif
-
-   // Write out final viz file */
-   if (opts.viz) {
-      DumpToVisit(*locDom, opts.numFiles, myRank, numRanks) ;
-   }
-   
-   if ((myRank == 0) && (opts.quiet == 0)) {
-      VerifyAndWriteFinalOutput(elapsed_timeG, *locDom, opts.nx, numRanks);
-   }
-
-   delete locDom; 
-
-#if USE_MPI
-   MPI_Finalize() ;
-#endif
-
-   return 0 ;
-}
diff --git a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh.h b/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh.h
deleted file mode 100644
index 29b2c64e7..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh.h
+++ /dev/null
@@ -1,661 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-
-// OpenMP will be compiled in if this flag is set to 1 AND the compiler beging
-// used supports it (i.e. the _OPENMP symbol is defined)
-//#define USE_OMP 1
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <stdlib.h>
-#include <math.h>
-#include <vector>
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-//////////////////////////////////////////////////////
-// Helper functions
-//////////////////////////////////////////////////////
-
-/* might want to add access methods so that memory can be */
-/* better managed, as in luleshFT */
-
-template <typename T>
-inline T *Allocate(size_t size)
-{
-   return static_cast<T *>(malloc(sizeof(T)*size)) ;
-}
-
-template <typename T>
-inline void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Index_t numNode) // Node-centered
-   {
-      m_x.resize(numNode);  // coordinates
-      m_y.resize(numNode);
-      m_z.resize(numNode);
-
-      m_xd.resize(numNode); // velocities
-      m_yd.resize(numNode);
-      m_zd.resize(numNode);
-
-      m_xdd.resize(numNode); // accelerations
-      m_ydd.resize(numNode);
-      m_zdd.resize(numNode);
-
-      m_fx.resize(numNode);  // forces
-      m_fy.resize(numNode);
-      m_fz.resize(numNode);
-
-      m_nodalMass.resize(numNode);  // mass
-   }
-
-   void AllocateElemPersistent(Index_t numElem) // Elem-centered
-   {
-      m_nodelist.resize(8*numElem);
-
-      // elem connectivities through face
-      m_lxim.resize(numElem);
-      m_lxip.resize(numElem);
-      m_letam.resize(numElem);
-      m_letap.resize(numElem);
-      m_lzetam.resize(numElem);
-      m_lzetap.resize(numElem);
-
-      m_elemBC.resize(numElem);
-
-      m_e.resize(numElem);
-      m_p.resize(numElem);
-
-      m_q.resize(numElem);
-      m_ql.resize(numElem);
-      m_qq.resize(numElem);
-
-      m_v.resize(numElem);
-
-      m_volo.resize(numElem);
-      m_delv.resize(numElem);
-      m_vdov.resize(numElem);
-
-      m_arealg.resize(numElem);
-
-      m_ss.resize(numElem);
-
-      m_elemMass.resize(numElem);
-
-      m_vnew.resize(numElem) ;
-   }
-
-   void AllocateGradients(Index_t numElem, Index_t allElem)
-   {
-      // Position gradients
-      m_delx_xi.resize(numElem) ;
-      m_delx_eta.resize(numElem) ;
-      m_delx_zeta.resize(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi.resize(allElem) ;
-      m_delv_eta.resize(allElem);
-      m_delv_zeta.resize(allElem) ;
-   }
-
-   void DeallocateGradients()
-   {
-      m_delx_zeta.clear() ;
-      m_delx_eta.clear() ;
-      m_delx_xi.clear() ;
-
-      m_delv_zeta.clear() ;
-      m_delv_eta.clear() ;
-      m_delv_xi.clear() ;
-   }
-
-   void AllocateStrains(Index_t numElem)
-   {
-      m_dxx.resize(numElem) ;
-      m_dyy.resize(numElem) ;
-      m_dzz.resize(numElem) ;
-   }
-
-   void DeallocateStrains()
-   {
-      m_dzz.clear() ;
-      m_dyy.clear() ;
-      m_dxx.clear() ;
-   }
-
-   void AllocateSymmetry(Index_t size)
-   {
-      if (m_colLoc == 0) {
-        m_symmX.resize(size);
-      }
-      if (m_rowLoc == 0) {
-        m_symmY.resize(size);
-      }
-      if (m_planeLoc == 0) {
-        m_symmZ.resize(size);
-      }
-   }
-
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_x[idx] ; }
-   Real_t& y(Index_t idx)    { return m_y[idx] ; }
-   Real_t& z(Index_t idx)    { return m_z[idx] ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
-   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
-   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
-   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
-   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
-   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
-   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   // Nodes on symmertry planes
-   Index_t symmX(Index_t idx) { return m_symmX[idx] ; }
-   Index_t symmY(Index_t idx) { return m_symmY[idx] ; }
-   Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; }
-   bool symmXempty()          { return m_symmX.empty(); }
-   bool symmYempty()          { return m_symmY.empty(); }
-   bool symmZempty()          { return m_symmZ.empty(); }
-
-   //
-   // Element-centered
-   //
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_t*  regNumList()            { return &m_regNumList[0] ; }
-   Index_t*  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
-
-   Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
-   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
-   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
-   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
-   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
-   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_p[idx] ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_q[idx] ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_v[idx] ; }
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_t *nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-   
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_t *commDataSend ;
-   Real_t *commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems);
-   void SetupThreadSupportStructures();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void SetupCommBuffers(Int_t edgeNodes);
-   void SetupSymmetryPlanes(Int_t edgeNodes);
-   void SetupElementConnectivities(Int_t edgeElems);
-   void SetupBoundaryConditions(Int_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* Node-centered */
-   std::vector<Real_t> m_x ;  /* coordinates */
-   std::vector<Real_t> m_y ;
-   std::vector<Real_t> m_z ;
-
-   std::vector<Real_t> m_xd ; /* velocities */
-   std::vector<Real_t> m_yd ;
-   std::vector<Real_t> m_zd ;
-
-   std::vector<Real_t> m_xdd ; /* accelerations */
-   std::vector<Real_t> m_ydd ;
-   std::vector<Real_t> m_zdd ;
-
-   std::vector<Real_t> m_fx ;  /* forces */
-   std::vector<Real_t> m_fy ;
-   std::vector<Real_t> m_fz ;
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
-   std::vector<Index_t> m_symmY ;
-   std::vector<Index_t> m_symmZ ;
-
-   // Element-centered
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_t *m_regElemSize ;   // Size of region sets
-   Index_t *m_regNumList ;    // Region number per domain element
-   Index_t **m_regElemlist ;  // region indexset 
-
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */
-   std::vector<Index_t>  m_lxip ;
-   std::vector<Index_t>  m_letam ;
-   std::vector<Index_t>  m_letap ;
-   std::vector<Index_t>  m_lzetam ;
-   std::vector<Index_t>  m_lzetap ;
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   std::vector<Real_t> m_p ;   /* pressure */
-   std::vector<Real_t> m_q ;   /* q */
-   std::vector<Real_t> m_ql ;  /* linear term for q */
-   std::vector<Real_t> m_qq ;  /* quadratic term for q */
-
-   std::vector<Real_t> m_v ;     /* relative volume */
-   std::vector<Real_t> m_volo ;  /* reference volume */
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-   // OMP hack 
-   Index_t *m_nodeElemStart ;
-   Index_t *m_nodeElemCornerList ;
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh_ptr.h b/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh_ptr.h
deleted file mode 100644
index 905ed22dc..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh_ptr.h
+++ /dev/null
@@ -1,668 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-
-// OpenMP will be compiled in if this flag is set to 1 AND the compiler beging
-// used supports it (i.e. the _OPENMP symbol is defined)
-//#define USE_OMP 1
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <stdlib.h>
-#include <math.h>
-#include <vector>
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-typedef Real_t * __restrict__ Real_p ;
-typedef Index_t * __restrict__ Index_p ;
-typedef Int_t * __restrict__ Int_p ;
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-//////////////////////////////////////////////////////
-// Helper functions
-//////////////////////////////////////////////////////
-
-/* might want to add access methods so that memory can be */
-/* better managed, as in luleshFT */
-
-template <typename T>
-inline T *Allocate(size_t size)
-{
-   return static_cast<T *>(malloc(sizeof(T)*size)) ;
-}
-
-template <typename T>
-inline void Release(T **ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-template <typename T>
-inline void Release(T * __restrict__ *ptr)
-{
-   if (*ptr != NULL) {
-      free(*ptr) ;
-      *ptr = NULL ;
-   }
-}
-
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Index_t numNode) // Node-centered
-   {
-      m_x = Allocate<Real_t>(numNode) ; // coordinates
-      m_y = Allocate<Real_t>(numNode) ;
-      m_z = Allocate<Real_t>(numNode) ;
-
-      m_xd = Allocate<Real_t>(numNode) ; // velocities
-      m_yd = Allocate<Real_t>(numNode) ;
-      m_zd = Allocate<Real_t>(numNode) ;
-
-      m_xdd = Allocate<Real_t>(numNode) ; // accelerations
-      m_ydd = Allocate<Real_t>(numNode) ;
-      m_zdd = Allocate<Real_t>(numNode) ;
-
-      m_fx = Allocate<Real_t>(numNode) ; // forces
-      m_fy = Allocate<Real_t>(numNode) ;
-      m_fz = Allocate<Real_t>(numNode) ;
-
-      m_nodalMass = Allocate<Real_t>(numNode) ; // mass
-   }
-
-   void AllocateElemPersistent(Index_t numElem) // Elem-centered
-   {
-      m_nodelist = Allocate<Index_t>(8*numElem) ;
-
-      // elem connectivities through face
-      m_lxim = Allocate<Index_t>(numElem) ;
-      m_lxip = Allocate<Index_t>(numElem) ;
-      m_letam = Allocate<Index_t>(numElem) ;
-      m_letap = Allocate<Index_t>(numElem) ;
-      m_lzetam = Allocate<Index_t>(numElem) ;
-      m_lzetap = Allocate<Index_t>(numElem) ;
-
-      m_elemBC = Allocate<Int_t>(numElem) ;
-
-      m_e = Allocate<Real_t>(numElem) ;
-      m_p = Allocate<Real_t>(numElem) ;
-
-      m_q = Allocate<Real_t>(numElem) ;
-      m_ql = Allocate<Real_t>(numElem) ;
-      m_qq = Allocate<Real_t>(numElem) ;
-
-      m_v = Allocate<Real_t>(numElem) ;
-
-      m_volo = Allocate<Real_t>(numElem) ;
-      m_delv = Allocate<Real_t>(numElem) ;
-      m_vdov = Allocate<Real_t>(numElem) ;
-
-      m_arealg = Allocate<Real_t>(numElem) ;
-
-      m_ss = Allocate<Real_t>(numElem) ;
-
-      m_elemMass = Allocate<Real_t>(numElem) ;
-
-      m_vnew = Allocate<Real_t>(numElem) ;
-   }
-
-   void AllocateGradients(Index_t numElem, Index_t allElem)
-   {
-      // Position gradients
-      m_delx_xi = Allocate<Real_t>(numElem) ;
-      m_delx_eta = Allocate<Real_t>(numElem) ;
-      m_delx_zeta = Allocate<Real_t>(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi = Allocate<Real_t>(allElem) ;
-      m_delv_eta = Allocate<Real_t>(allElem) ;
-      m_delv_zeta = Allocate<Real_t>(allElem) ;
-   }
-
-   void DeallocateGradients()
-   {
-      Release(&m_delv_zeta) ;
-      Release(&m_delv_eta) ;
-      Release(&m_delv_xi) ;
-
-      Release(&m_delx_zeta) ;
-      Release(&m_delx_eta) ;
-      Release(&m_delx_xi) ;
-   }
-
-   void AllocateStrains(Index_t numElem)
-   {
-      m_dxx = Allocate<Real_t>(numElem) ;
-      m_dyy = Allocate<Real_t>(numElem) ;
-      m_dzz = Allocate<Real_t>(numElem) ;
-   }
-
-   void DeallocateStrains()
-   {
-      Release(&m_dzz) ;
-      Release(&m_dyy) ;
-      Release(&m_dxx) ;
-   }
-   
-   void AllocateSymmetry(Index_t size)
-   {
-     m_symmX = ((m_colLoc == 0) ? Allocate<Index_t>(size) : 0 );
-     m_symmY = ((m_rowLoc == 0) ? Allocate<Index_t>(size) : 0 );
-     m_symmZ = ((m_planeLoc == 0) ? Allocate<Index_t>(size) : 0);
-   }
-
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_x[idx] ; }
-   Real_t& y(Index_t idx)    { return m_y[idx] ; }
-   Real_t& z(Index_t idx)    { return m_z[idx] ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
-   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
-   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
-   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
-   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
-   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
-   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   // Nodes on symmertry planes
-   Index_t symmX(Index_t idx) { return m_symmX[idx] ; }
-   Index_t symmY(Index_t idx) { return m_symmY[idx] ; }
-   Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; }
-   bool symmXempty()          { return (m_symmX == 0); }
-   bool symmYempty()          { return (m_symmY == 0); }
-   bool symmZempty()          { return (m_symmZ == 0); }
-
-   //
-   // Element-centered
-   //
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_p  regNumList()            { return &m_regNumList[0] ; }
-   Index_p  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
-
-   Index_p  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
-   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
-   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
-   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
-   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
-   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_p[idx] ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_q[idx] ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_v[idx] ; }
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_p nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-   
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_p commDataSend ;
-   Real_p commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems);
-   void SetupThreadSupportStructures();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void SetupCommBuffers(Int_t edgeNodes);
-   void SetupSymmetryPlanes(Int_t edgeNodes);
-   void SetupElementConnectivities(Int_t edgeElems);
-   void SetupBoundaryConditions(Int_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* Node-centered */
-   Real_p m_x ;  /* coordinates */
-   Real_p m_y ;
-   Real_p m_z ;
-
-   Real_p m_xd ; /* velocities */
-   Real_p m_yd ;
-   Real_p m_zd ;
-
-   Real_p m_xdd ; /* accelerations */
-   Real_p m_ydd ;
-   Real_p m_zdd ;
-
-   Real_p m_fx ;  /* forces */
-   Real_p m_fy ;
-   Real_p m_fz ;
-
-   Real_p m_nodalMass ;  /* mass */
-
-   Index_p m_symmX ;  /* symmetry plane nodesets */
-   Index_p m_symmY ;
-   Index_p m_symmZ ;
-
-   // Element-centered
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_p m_regElemSize ;   // Size of region sets
-   Index_p m_regNumList ;    // Region number per domain element
-   Index_p *m_regElemlist ;  // region indexset 
-
-   Index_p  m_nodelist ;     /* elemToNode connectivity */
-
-   Index_p  m_lxim ;  /* element connectivity across each face */
-   Index_p  m_lxip ;
-   Index_p  m_letam ;
-   Index_p  m_letap ;
-   Index_p  m_lzetam ;
-   Index_p  m_lzetap ;
-
-   Int_p    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   Real_p m_dxx ;  /* principal strains -- temporary */
-   Real_p m_dyy ;
-   Real_p m_dzz ;
-
-   Real_p m_delv_xi ;    /* velocity gradient -- temporary */
-   Real_p m_delv_eta ;
-   Real_p m_delv_zeta ;
-
-   Real_p m_delx_xi ;    /* coordinate gradient -- temporary */
-   Real_p m_delx_eta ;
-   Real_p m_delx_zeta ;
-   
-   Real_p m_e ;   /* energy */
-
-   Real_p m_p ;   /* pressure */
-   Real_p m_q ;   /* q */
-   Real_p m_ql ;  /* linear term for q */
-   Real_p m_qq ;  /* quadratic term for q */
-
-   Real_p m_v ;     /* relative volume */
-   Real_p m_volo ;  /* reference volume */
-   Real_p m_vnew ;  /* new relative volume -- temporary */
-   Real_p m_delv ;  /* m_vnew - m_v */
-   Real_p m_vdov ;  /* volume derivative over volume */
-
-   Real_p m_arealg ;  /* characteristic length of an element */
-   
-   Real_p m_ss ;      /* "sound speed" */
-
-   Real_p m_elemMass ;  /* mass */
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-   // OMP hack 
-   Index_p m_nodeElemStart ;
-   Index_p m_nodeElemCornerList ;
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh_tuple.h b/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh_tuple.h
deleted file mode 100644
index 1e89cf29a..000000000
--- a/test/LULESH-v2.0/LULESH-v2.0_baseline/lulesh_tuple.h
+++ /dev/null
@@ -1,616 +0,0 @@
-#if !defined(USE_MPI)
-# error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
-#endif
-
-
-// OpenMP will be compiled in if this flag is set to 1 AND the compiler beging
-// used supports it (i.e. the _OPENMP symbol is defined)
-//#define USE_OMP 1
-
-#if USE_MPI
-#include <mpi.h>
-
-/*
-   define one of these three symbols:
-
-   SEDOV_SYNC_POS_VEL_NONE
-   SEDOV_SYNC_POS_VEL_EARLY
-   SEDOV_SYNC_POS_VEL_LATE
-*/
-
-#define SEDOV_SYNC_POS_VEL_EARLY 1
-#endif
-
-#include <math.h>
-#include <vector>
-
-//**************************************************
-// Allow flexibility for arithmetic representations 
-//**************************************************
-
-#define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
-
-
-// Precision specification
-typedef float        real4 ;
-typedef double       real8 ;
-typedef long double  real10 ;  // 10 bytes on x86
-
-typedef int    Index_t ; // array subscript and loop index
-typedef real8  Real_t ;  // floating point representation
-typedef int    Int_t ;   // integer representation
-
-enum { VolumeError = -1, QStopError = -2 } ;
-
-inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
-inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
-inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
-
-inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
-inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
-inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
-
-inline real4  FABS(real4  arg) { return fabsf(arg) ; }
-inline real8  FABS(real8  arg) { return fabs(arg) ; }
-inline real10 FABS(real10 arg) { return fabsl(arg) ; }
-
-
-// Stuff needed for boundary conditions
-// 2 BCs on each of 6 hexahedral faces (12 bits)
-#define XI_M        0x00007
-#define XI_M_SYMM   0x00001
-#define XI_M_FREE   0x00002
-#define XI_M_COMM   0x00004
-
-#define XI_P        0x00038
-#define XI_P_SYMM   0x00008
-#define XI_P_FREE   0x00010
-#define XI_P_COMM   0x00020
-
-#define ETA_M       0x001c0
-#define ETA_M_SYMM  0x00040
-#define ETA_M_FREE  0x00080
-#define ETA_M_COMM  0x00100
-
-#define ETA_P       0x00e00
-#define ETA_P_SYMM  0x00200
-#define ETA_P_FREE  0x00400
-#define ETA_P_COMM  0x00800
-
-#define ZETA_M      0x07000
-#define ZETA_M_SYMM 0x01000
-#define ZETA_M_FREE 0x02000
-#define ZETA_M_COMM 0x04000
-
-#define ZETA_P      0x38000
-#define ZETA_P_SYMM 0x08000
-#define ZETA_P_FREE 0x10000
-#define ZETA_P_COMM 0x20000
-
-// MPI Message Tags
-#define MSG_COMM_SBN      1024
-#define MSG_SYNC_POS_VEL  2048
-#define MSG_MONOQ         3072
-
-#define MAX_FIELDS_PER_MPI_COMM 6
-
-// Assume 128 byte coherence
-// Assume Real_t is an "integral power of 2" bytes wide
-#define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
-
-#define CACHE_ALIGN_REAL(n) \
-   (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
-
-//////////////////////////////////////////////////////
-// Primary data structure
-//////////////////////////////////////////////////////
-
-/*
- * The implementation of the data abstraction used for lulesh
- * resides entirely in the Domain class below.  You can change
- * grouping and interleaving of fields here to maximize data layout
- * efficiency for your underlying architecture or compiler.
- *
- * For example, fields can be implemented as STL objects or
- * raw array pointers.  As another example, individual fields
- * m_x, m_y, m_z could be budled into
- *
- *    struct { Real_t x, y, z ; } *m_coord ;
- *
- * allowing accessor functions such as
- *
- *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
- *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
- *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
- */
-
-class Domain {
-
-   public:
-
-   // Constructor
-   Domain(Int_t numRanks, Index_t colLoc,
-          Index_t rowLoc, Index_t planeLoc,
-          Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
-   
-   // Destructor
-   ~Domain();
-
-   //
-   // ALLOCATION
-   //
-
-   void AllocateNodePersistent(Int_t numNode) // Node-centered
-   {
-      m_coord.resize(numNode);  // coordinates
-
-      m_vel.resize(numNode); // velocities
-
-      m_acc.resize(numNode); // accelerations
-
-      m_force.resize(numNode);  // forces
-
-      m_nodalMass.resize(numNode);  // mass
-   }
-
-   void AllocateElemPersistent(Int_t numElem) // Elem-centered
-   {
-      m_nodelist.resize(8*numElem);
-
-      // elem connectivities through face
-      m_faceToElem.resize(numElem);
-
-      m_elemBC.resize(numElem);
-
-      m_e.resize(numElem);
-
-      m_pq.resize(numElem);
-
-      m_qlqq.resize(numElem);
-
-      m_vol.resize(numElem);
-
-      m_delv.resize(numElem);
-      m_vdov.resize(numElem);
-
-      m_arealg.resize(numElem);
-
-      m_ss.resize(numElem);
-
-      m_elemMass.resize(numElem);
-
-      m_vnew.resize(numElem) ;
-   }
-
-   void AllocateGradients(Int_t numElem, Int_t allElem)
-   {
-      // Position gradients
-      m_delx_xi.resize(numElem) ;
-      m_delx_eta.resize(numElem) ;
-      m_delx_zeta.resize(numElem) ;
-
-      // Velocity gradients
-      m_delv_xi.resize(allElem) ;
-      m_delv_eta.resize(allElem);
-      m_delv_zeta.resize(allElem) ;
-   }
-
-   void DeallocateGradients()
-   {
-      m_delx_zeta.clear() ;
-      m_delx_eta.clear() ;
-      m_delx_xi.clear() ;
-
-      m_delv_zeta.clear() ;
-      m_delv_eta.clear() ;
-      m_delv_xi.clear() ;
-   }
-
-   void AllocateStrains(Int_t numElem)
-   {
-      m_dxx.resize(numElem) ;
-      m_dyy.resize(numElem) ;
-      m_dzz.resize(numElem) ;
-   }
-
-   void DeallocateStrains()
-   {
-      m_dzz.clear() ;
-      m_dyy.clear() ;
-      m_dxx.clear() ;
-   }
-   
-   //
-   // ACCESSORS
-   //
-
-   // Node-centered
-
-   // Nodal coordinates
-   Real_t& x(Index_t idx)    { return m_coord[idx].x ; }
-   Real_t& y(Index_t idx)    { return m_coord[idx].y ; }
-   Real_t& z(Index_t idx)    { return m_coord[idx].z ; }
-
-   // Nodal velocities
-   Real_t& xd(Index_t idx)   { return m_vel[idx].x ; }
-   Real_t& yd(Index_t idx)   { return m_vel[idx].y ; }
-   Real_t& zd(Index_t idx)   { return m_vel[idx].z ; }
-
-   // Nodal accelerations
-   Real_t& xdd(Index_t idx)  { return m_acc[idx].x ; }
-   Real_t& ydd(Index_t idx)  { return m_acc[idx].y ; }
-   Real_t& zdd(Index_t idx)  { return m_acc[idx].z ; }
-
-   // Nodal forces
-   Real_t& fx(Index_t idx)   { return m_force[idx].x ; }
-   Real_t& fy(Index_t idx)   { return m_force[idx].y ; }
-   Real_t& fz(Index_t idx)   { return m_force[idx].z ; }
-
-   // Nodal mass
-   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
-
-   // Nodes on symmertry planes
-   Index_t symmX(Index_t idx) { return m_symmX[idx] ; }
-   Index_t symmY(Index_t idx) { return m_symmY[idx] ; }
-   Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; }
-   bool symmXempty()          { return m_symmX.empty(); }
-   bool symmYempty()          { return m_symmY.empty(); }
-   bool symmZempty()          { return m_symmZ.empty(); }
-
-   //
-   // Element-centered
-   //
-   Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
-   Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
-   Index_t*  regNumList()            { return &m_regNumList[0] ; }
-   Index_t*  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
-   Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
-
-   Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
-
-   // elem connectivities through face
-   Index_t&  lxim(Index_t idx) { return m_faceToElem[idx].lxim ; }
-   Index_t&  lxip(Index_t idx) { return m_faceToElem[idx].lxip ; }
-   Index_t&  letam(Index_t idx) { return m_faceToElem[idx].letam ; }
-   Index_t&  letap(Index_t idx) { return m_faceToElem[idx].letap ; }
-   Index_t&  lzetam(Index_t idx) { return m_faceToElem[idx].lzetam ; }
-   Index_t&  lzetap(Index_t idx) { return m_faceToElem[idx].lzetap ; }
-
-   // elem face symm/free-surface flag
-   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
-
-   // Principal strains - temporary
-   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
-   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
-   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
-
-   // New relative volume - temporary
-   Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
-
-   // Velocity gradient - temporary
-   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
-   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
-   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
-
-   // Position gradient - temporary
-   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
-   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
-   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
-
-   // Energy
-   Real_t& e(Index_t idx)          { return m_e[idx] ; }
-
-   // Pressure
-   Real_t& p(Index_t idx)          { return m_pq[idx].p ; }
-
-   // Artificial viscosity
-   Real_t& q(Index_t idx)          { return m_pq[idx].q ; }
-
-   // Linear term for q
-   Real_t& ql(Index_t idx)         { return m_qlqq[idx].ql ; }
-   // Quadratic term for q
-   Real_t& qq(Index_t idx)         { return m_qlqq[idx].qq ; }
-
-   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
-
-   // Relative volume
-   Real_t& v(Index_t idx)          { return m_vol[idx].v ; }
-   // Reference volume
-   Real_t& volo(Index_t idx)       { return m_vol[idx].volo ; }
-
-   // volume derivative over volume
-   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
-
-   // Element characteristic length
-   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
-
-   // Sound speed
-   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
-
-   // Element mass
-   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
-
-   Index_t nodeElemCount(Index_t idx)
-   { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
-
-   Index_t *nodeElemCornerList(Index_t idx)
-   { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
-
-   // Parameters 
-
-   // Cutoffs
-   Real_t u_cut() const               { return m_u_cut ; }
-   Real_t e_cut() const               { return m_e_cut ; }
-   Real_t p_cut() const               { return m_p_cut ; }
-   Real_t q_cut() const               { return m_q_cut ; }
-   Real_t v_cut() const               { return m_v_cut ; }
-
-   // Other constants (usually are settable via input file in real codes)
-   Real_t hgcoef() const              { return m_hgcoef ; }
-   Real_t qstop() const               { return m_qstop ; }
-   Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
-   Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
-   Real_t ss4o3() const               { return m_ss4o3 ; }
-   Real_t qlc_monoq() const           { return m_qlc_monoq ; }
-   Real_t qqc_monoq() const           { return m_qqc_monoq ; }
-   Real_t qqc() const                 { return m_qqc ; }
-
-   Real_t eosvmax() const             { return m_eosvmax ; }
-   Real_t eosvmin() const             { return m_eosvmin ; }
-   Real_t pmin() const                { return m_pmin ; }
-   Real_t emin() const                { return m_emin ; }
-   Real_t dvovmax() const             { return m_dvovmax ; }
-   Real_t refdens() const             { return m_refdens ; }
-
-   // Timestep controls, etc...
-   Real_t& time()                 { return m_time ; }
-   Real_t& deltatime()            { return m_deltatime ; }
-   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
-   Real_t& deltatimemultub()      { return m_deltatimemultub ; }
-   Real_t& stoptime()             { return m_stoptime ; }
-   Real_t& dtcourant()            { return m_dtcourant ; }
-   Real_t& dthydro()              { return m_dthydro ; }
-   Real_t& dtmax()                { return m_dtmax ; }
-   Real_t& dtfixed()              { return m_dtfixed ; }
-
-   Int_t&  cycle()                { return m_cycle ; }
-   Index_t&  numRanks()           { return m_numRanks ; }
-
-   Index_t&  colLoc()             { return m_colLoc ; }
-   Index_t&  rowLoc()             { return m_rowLoc ; }
-   Index_t&  planeLoc()           { return m_planeLoc ; }
-   Index_t&  tp()                 { return m_tp ; }
-
-   Index_t&  sizeX()              { return m_sizeX ; }
-   Index_t&  sizeY()              { return m_sizeY ; }
-   Index_t&  sizeZ()              { return m_sizeZ ; }
-   Index_t&  numReg()             { return m_numReg ; }
-   Int_t&  cost()             { return m_cost ; }
-   Index_t&  numElem()            { return m_numElem ; }
-   Index_t&  numNode()            { return m_numNode ; }
-   
-   Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
-   Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
-   
-   //
-   // MPI-Related additional data
-   //
-
-#if USE_MPI   
-   // Communication Work space 
-   Real_t *commDataSend ;
-   Real_t *commDataRecv ;
-   
-   // Maximum number of block neighbors 
-   MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
-   MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
-#endif
-
-  private:
-
-   void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems);
-   void SetupThreadSupportStructures();
-   void CreateRegionIndexSets(Int_t nreg, Int_t balance);
-   void SetupCommBuffers(Int_t edgeNodes);
-   void SetupSymmetryPlanes(Int_t edgeNodes);
-   void SetupElementConnectivities(Int_t edgeElems);
-   void SetupBoundaryConditions(Int_t edgeElems);
-
-   //
-   // IMPLEMENTATION
-   //
-
-   /* Node-centered */
-
-   struct Tuple3 {
-      Real_t x, y, z ;
-   } ;
-
-   std::vector<Tuple3> m_coord ;  /* coordinates */
-
-   std::vector<Tuple3> m_vel ; /* velocities */
-
-   std::vector<Tuple3> m_acc ; /* accelerations */
-
-   std::vector<Tuple3> m_force ;  /* forces */
-
-   std::vector<Real_t> m_nodalMass ;  /* mass */
-
-   std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
-   std::vector<Index_t> m_symmY ;
-   std::vector<Index_t> m_symmZ ;
-
-   // Element-centered
-
-   // Region information
-   Int_t    m_numReg ;
-   Int_t    m_cost; //imbalance cost
-   Index_t *m_regElemSize ;   // Size of region sets
-   Index_t *m_regNumList ;    // Region number per domain element
-   Index_t **m_regElemlist ;  // region indexset 
-
-   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
-
-   struct FaceElemConn {
-      Index_t lxim, lxip, letam, letap, lzetam, lzetap ;
-   } ;
-
-   std::vector<FaceElemConn> m_faceToElem ; /* element conn across faces */
-
-   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
-
-   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
-   std::vector<Real_t> m_dyy ;
-   std::vector<Real_t> m_dzz ;
-
-   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
-   std::vector<Real_t> m_delv_eta ;
-   std::vector<Real_t> m_delv_zeta ;
-
-   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
-   std::vector<Real_t> m_delx_eta ;
-   std::vector<Real_t> m_delx_zeta ;
-   
-   std::vector<Real_t> m_e ;   /* energy */
-
-   struct Pcomponents {
-      Real_t p, q ;
-   } ;
-
-   std::vector<Pcomponents> m_pq ;   /* pressure and artificial viscosity */
-
-   struct Qcomponents {
-      Real_t ql, qq ;
-   } ;
-
-   std::vector<Qcomponents> m_qlqq ;  /* linear and quadratic terms for q */
-
-   struct Volume {
-      Real_t v, volo ;
-   } ;
-
-   std::vector<Volume> m_vol ;     /* relative and reference volume */
-
-   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
-   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
-   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
-
-   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
-   
-   std::vector<Real_t> m_ss ;      /* "sound speed" */
-
-   std::vector<Real_t> m_elemMass ;  /* mass */
-
-   // Cutoffs (treat as constants)
-   const Real_t  m_e_cut ;             // energy tolerance 
-   const Real_t  m_p_cut ;             // pressure tolerance 
-   const Real_t  m_q_cut ;             // q tolerance 
-   const Real_t  m_v_cut ;             // relative volume tolerance 
-   const Real_t  m_u_cut ;             // velocity tolerance 
-
-   // Other constants (usually setable, but hardcoded in this proxy app)
-
-   const Real_t  m_hgcoef ;            // hourglass control 
-   const Real_t  m_ss4o3 ;
-   const Real_t  m_qstop ;             // excessive q indicator 
-   const Real_t  m_monoq_max_slope ;
-   const Real_t  m_monoq_limiter_mult ;
-   const Real_t  m_qlc_monoq ;         // linear term coef for q 
-   const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
-   const Real_t  m_qqc ;
-   const Real_t  m_eosvmax ;
-   const Real_t  m_eosvmin ;
-   const Real_t  m_pmin ;              // pressure floor 
-   const Real_t  m_emin ;              // energy floor 
-   const Real_t  m_dvovmax ;           // maximum allowable volume change 
-   const Real_t  m_refdens ;           // reference density 
-
-   // Variables to keep track of timestep, simulation time, and cycle
-   Real_t  m_dtcourant ;         // courant constraint 
-   Real_t  m_dthydro ;           // volume change constraint 
-   Int_t   m_cycle ;             // iteration count for simulation 
-   Real_t  m_dtfixed ;           // fixed time increment 
-   Real_t  m_time ;              // current time 
-   Real_t  m_deltatime ;         // variable time increment 
-   Real_t  m_deltatimemultlb ;
-   Real_t  m_deltatimemultub ;
-   Real_t  m_dtmax ;             // maximum allowable time increment 
-   Real_t  m_stoptime ;          // end time for simulation 
-
-
-   Int_t   m_numRanks ;
-
-   Index_t m_colLoc ;
-   Index_t m_rowLoc ;
-   Index_t m_planeLoc ;
-   Index_t m_tp ;
-
-   Index_t m_sizeX ;
-   Index_t m_sizeY ;
-   Index_t m_sizeZ ;
-   Index_t m_numElem ;
-   Index_t m_numNode ;
-
-   Index_t m_maxPlaneSize ;
-   Index_t m_maxEdgeSize ;
-
-   // OMP hack 
-   Index_t *m_nodeElemStart ;
-   Index_t *m_nodeElemCornerList ;
-
-   // Used in setup
-   Index_t m_rowMin, m_rowMax;
-   Index_t m_colMin, m_colMax;
-   Index_t m_planeMin, m_planeMax ;
-
-} ;
-
-typedef Real_t &(Domain::* Domain_member )(Index_t) ;
-
-struct cmdLineOpts {
-   Int_t its; // -i 
-   Int_t nx;  // -s 
-   Int_t numReg; // -r 
-   Int_t numFiles; // -f
-   Int_t showProg; // -p
-   Int_t quiet; // -q
-   Int_t viz; // -v 
-   Int_t cost; // -c
-   Int_t balance; // -b
-};
-
-
-
-// Function Prototypes
-
-// lulesh-par
-Real_t CalcElemVolume( const Real_t x[8],
-                       const Real_t y[8],
-                       const Real_t z[8]);
-
-// lulesh-util
-void ParseCommandLineOptions(int argc, char *argv[],
-                             Int_t myRank, struct cmdLineOpts *opts);
-void VerifyAndWriteFinalOutput(Real_t elapsed_time,
-                               Domain& locDom,
-                               Int_t nx,
-                               Int_t numRanks);
-
-// lulesh-viz
-void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
-
-// lulesh-comm
-void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doRecv, bool planeOnly);
-void CommSend(Domain& domain, Int_t msgType,
-              Index_t xferFields, Domain_member *fieldData,
-              Index_t dx, Index_t dy, Index_t dz,
-              bool doSend, bool planeOnly);
-void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
-void CommSyncPosVel(Domain& domain);
-void CommMonoQ(Domain& domain);
-
-// lulesh-init
-void InitMeshDecomp(Int_t numRanks, Int_t myRank,
-                    Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
diff --git a/test/LULESH-v2.0/README b/test/LULESH-v2.0/README
deleted file mode 100644
index d48c8cf3f..000000000
--- a/test/LULESH-v2.0/README
+++ /dev/null
@@ -1,41 +0,0 @@
-##
-## Copyright (c) 2016, Lawrence Livermore National Security, LLC.
-## 
-## Produced at the Lawrence Livermore National Laboratory.
-## 
-## All rights reserved.
-## 
-## For release details and restrictions, please see raja/README-license.txt
-##
-
-#
-# The subdirectories in this directory contain different versions of the
-# LULESH 2.0 proxy app. 
-#
-# The subdirectory LULESH-v2.0_baseline contains a reference version of 
-# LULESH 2.0 that is available at https://codesign.llnl.gov/lulesh.php.
-#
-# The directory LULESH-v2.0_RAJA-variants contains three subdirectories with
-# RAJA variants of LULESH 2.0. They are: 
-#
-# 1) LULESH-v2.0_RAJA-basic contains a basic translation to RAJA that uses
-#    only RAJA forall traversals that take begin-end args or arrays of 
-#    indirection indices. 
-# 2) LULESH-v2.0_RAJA-IndexSet contains a version that uses RAJA IndexSets
-#    similar to LULESH 1.0. It can be run in 3 different variants using RAJA.
-#    See the file luleshPolicy.hxx for more details and how to select the 
-#    execution mode.
-# 3) LULESH-v2.0_RAJA-MICfriendly' contains a version that
-#    uses RAJA IndexSets to permute data and loop iteration ordering in ways
-#    that can be beneficial in a manycore environment.
-#
-# When RAJA is compiled, the default variants of these examples will be
-# generated.
-#
-# RAJA must be built with CUDA enabled to generate GPU variants.
-#
-# NOTE: When running CUDA variants of RAJA LULESH, we advise you to set the
-#       environment variable CUDA_VISIBLE_DEVICES to zero before running.
-#       We are using CUDA Unified Memory and we find that this setting 
-#       greatly improves performance.
-#
-- 
GitLab


From f64d5e5ee2db44317ca65b657835812dff818e75 Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Tue, 28 Jun 2016 11:39:31 -0700
Subject: [PATCH 7/9] Added CTest support for CPU unit-tests

---
 CMakeLists.txt                                  |  1 +
 test/unit-tests/CPUtests/CMakeLists.txt         | 12 ++++++++++++
 test/unit-tests/CPUtests/main-nested-reduce.cxx |  6 +++++-
 test/unit-tests/CPUtests/main-nested.cxx        |  6 +++++-
 test/unit-tests/CPUtests/main-reduce.cxx        |  6 +++++-
 test/unit-tests/CPUtests/main-traversal.cxx     |  7 +++++++
 6 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8becce5a3..8a645ccbe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,5 +88,6 @@ add_subdirectory(src)
 install(FILES ${PROJECT_BINARY_DIR}/include/RAJA/raja-config.cmake DESTINATION share/cmake/raja)
 
 if(RAJA_ENABLE_TESTS)
+  enable_testing()
   add_subdirectory(test)
 endif()
diff --git a/test/unit-tests/CPUtests/CMakeLists.txt b/test/unit-tests/CPUtests/CMakeLists.txt
index a755b996d..fe754bcc6 100644
--- a/test/unit-tests/CPUtests/CMakeLists.txt
+++ b/test/unit-tests/CPUtests/CMakeLists.txt
@@ -69,10 +69,22 @@ else()
   
   add_executable(CPUnested-test.exe
     main-nested.cxx)
+
+  add_test(CPUtraversal
+    CPUtraversal-test.exe)
+
+  add_test(CPUreduce
+    CPUreduce-test.exe)
+
+  add_test(CPUnested
+    CPUnested-test.exe)
     
   if(RAJA_ENABLE_OPENMP)
     add_executable(CPUnested_reduce-test.exe
       main-nested-reduce.cxx)
+
+    add_test(CPUnested-reduce
+      CPUnested_reduce-test.exe)
   endif()
 
 endif()
diff --git a/test/unit-tests/CPUtests/main-nested-reduce.cxx b/test/unit-tests/CPUtests/main-nested-reduce.cxx
index 6c7be0d30..3f352c8b0 100644
--- a/test/unit-tests/CPUtests/main-nested-reduce.cxx
+++ b/test/unit-tests/CPUtests/main-nested-reduce.cxx
@@ -83,5 +83,9 @@ int main(int argc, char *argv[])
    printf("\n All Tests : # passed / # run = %d / %d\n\n DONE!!!\n",
           passed, run) ;
 
-   return 0 ;
+   if (passed == run) {
+     return 0 ;
+   } else {
+     return 1 ;
+   }
 }
diff --git a/test/unit-tests/CPUtests/main-nested.cxx b/test/unit-tests/CPUtests/main-nested.cxx
index a0a8ba16b..10883c4ac 100644
--- a/test/unit-tests/CPUtests/main-nested.cxx
+++ b/test/unit-tests/CPUtests/main-nested.cxx
@@ -469,6 +469,10 @@ int main(int argc, char *argv[])
 
    cout << "\n DONE!!! " << endl;
 
-   return 0 ;
+   if (s_ntests_passed_total == s_ntests_run_total) {
+     return 0 ;
+   } else {
+     return 1 ;
+   }
 }
 
diff --git a/test/unit-tests/CPUtests/main-reduce.cxx b/test/unit-tests/CPUtests/main-reduce.cxx
index 964629850..d05a5bb91 100644
--- a/test/unit-tests/CPUtests/main-reduce.cxx
+++ b/test/unit-tests/CPUtests/main-reduce.cxx
@@ -901,5 +901,9 @@ int main(int argc, char* argv[]) {
 
   cout << "\n DONE!!! " << endl;
 
-  return 0;
+   if (s_ntests_passed_total == s_ntests_run_total) {
+     return 0 ;
+   } else {
+     return 1 ;
+   }
 }
diff --git a/test/unit-tests/CPUtests/main-traversal.cxx b/test/unit-tests/CPUtests/main-traversal.cxx
index 4167904a6..d159a3730 100644
--- a/test/unit-tests/CPUtests/main-traversal.cxx
+++ b/test/unit-tests/CPUtests/main-traversal.cxx
@@ -573,5 +573,12 @@ int main(int argc, char *argv[])
    cout << "\n DONE!!! " << endl;
 
    return 0 ;
+
+   if (s_ntests_passed_total == s_ntests_run_total) {
+     return 0 ;
+   } else {
+     return 1 ;
+   }
+
 }
 
-- 
GitLab


From cd5ec8fb231277f2318adf7692b742cfa1c9a4d8 Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Tue, 28 Jun 2016 11:48:46 -0700
Subject: [PATCH 8/9] Remove old testing commands from Travis config

---
 .travis.yml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index c0cffc027..75c4a469d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -71,12 +71,3 @@ script:
   - cmake ../ -DCMAKE_CXX_COMPILER="$COMPILER"
   - make -j
   - make test # currently does nothing, this is how testing should be done
-  - ./test/unit-tests/CPUtests/CPUnested-test.exe | tee test.out
-  - grep "All Tests" test.out | sed -e 's/.* \([0-9]\+\) \/ \([0-9]\+\)$/\1\t\2/' | awk '{print $1 "/" $2; exit !($1 == $2)}'
-  - ./test/unit-tests/CPUtests/CPUnested_reduce-test.exe | tee test.out
-  - grep "All Tests" test.out | sed -e 's/.* \([0-9]\+\) \/ \([0-9]\+\)$/\1\t\2/' | awk '{print $1 "/" $2; exit !($1 == $2)}'
-  - ./test/unit-tests/CPUtests/CPUreduce-test.exe | tee test.out
-  - grep "All Tests" test.out | sed -e 's/.* \([0-9]\+\) \/ \([0-9]\+\)$/\1\t\2/' | awk '{print $1 "/" $2; exit !($1 == $2)}'
-  - ./test/unit-tests/CPUtests/CPUtraversal-test.exe | tee test.out
-  - grep "All Tests" test.out | sed -e 's/.* \([0-9]\+\) \/ \([0-9]\+\)$/\1\t\2/' | awk '{print $1 "/" $2; exit !($1 == $2)}'
-
-- 
GitLab


From c6588d4f9e910c31073bbcf56adb080a730aa5c3 Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Tue, 5 Jul 2016 17:27:50 -0700
Subject: [PATCH 9/9] Delete unnecessary return statement

---
 test/unit-tests/CPUtests/main-traversal.cxx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/unit-tests/CPUtests/main-traversal.cxx b/test/unit-tests/CPUtests/main-traversal.cxx
index d159a3730..a6b8d8ff9 100644
--- a/test/unit-tests/CPUtests/main-traversal.cxx
+++ b/test/unit-tests/CPUtests/main-traversal.cxx
@@ -572,8 +572,6 @@ int main(int argc, char *argv[])
 
    cout << "\n DONE!!! " << endl;
 
-   return 0 ;
-
    if (s_ntests_passed_total == s_ntests_run_total) {
      return 0 ;
    } else {
-- 
GitLab