diff --git a/driver/rnn_seq_driver.hpp b/driver/rnn_seq_driver.hpp
index afd8bf8b5b..dcfac06033 100644
--- a/driver/rnn_seq_driver.hpp
+++ b/driver/rnn_seq_driver.hpp
@@ -948,7 +948,7 @@ int RNNSeqDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
     }
 
     // Unless seed is persistent between runs validation using cache stored in file is impossible.
-    srand(0);
+    prng::reset_seed();
 
    auto fill_array_via_gen = [](auto& dst, size_t dst_sz, double range_l, double range_r) {
        for(size_t it = 0; it < dst_sz; it++)
diff --git a/src/ocl/rnnocl.cpp b/src/ocl/rnnocl.cpp
index 131d69db46..b874441b44 100644
--- a/src/ocl/rnnocl.cpp
+++ b/src/ocl/rnnocl.cpp
@@ -570,6 +570,26 @@ void RNNDescriptor::RNNForwardTraining_MS(Handle& handle,
     const std::vector<size_t> hcy_dst_stride{
         static_cast<size_t>(hidden_size * max_batch), static_cast<size_t>(hidden_size), 1};
 
+    if(in_n.at(0) < max_batch)
+    {
+        float beta = 0.;
+        const std::vector<size_t> zero_set_size{1,
+                                                static_cast<size_t>(max_batch - in_n.at(0)),
+                                                static_cast<size_t>(hidden_size)};
+        auto set_batch_offset = in_n.at(0) * hidden_size;
+
+        auto set_desc =
+            miopen::TensorDescriptor(wDesc.GetType(), zero_set_size, hcy_dst_stride);
+        if(hy != nullptr)
+        {
+            SetTensor(handle, set_desc, hy, &beta, hcy_layer_offset + set_batch_offset);
+        }
+        if(cy != nullptr)
+        {
+            SetTensor(handle, set_desc, cy, &beta, hcy_layer_offset + set_batch_offset);
+        }
+    }
+
     for(int time_i = seq_len - 1; time_i >= 0; time_i--)
     {
         auto copy_batch = (time_i == seq_len - 1) ? in_n.at(time_i)
@@ -2879,86 +2899,89 @@ void RNNDescriptor::RNNForwardTrainingPackedTensors(
                 }
                 else
                 {
-                    sp_size[1] = batch_n - in_n.at(0);
-                    sp_size[2] = wei_len;
-                    sp_desc    = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride);
-                    w_size[1]  = 1;
-                    w_size[2]  = wei_len;
-                    w_desc     = miopen::TensorDescriptor(wDesc.GetType(), w_size, w_stride);
+                    if(batch_n - in_n.at(0) > 0)
+                    {
+                        sp_size[1] = batch_n - in_n.at(0);
+                        sp_size[2] = wei_len;
+                        sp_desc    = miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride);
+                        w_size[1]  = 1;
+                        w_size[2]  = wei_len;
+                        w_desc     = miopen::TensorDescriptor(wDesc.GetType(), w_size, w_stride);
 
-                    OpTensor(handle,
-                             miopenTensorOpAdd,
-                             &alpha0,
-                             sp_desc,
-                             reserveSpace,
-                             &alpha1,
-                             w_desc,
-                             w,
-                             &beta_t,
-                             sp_desc,
-                             reserveSpace,
-                             hid_shift + in_n.at(0) * hy_stride,
-                             wei_shift_bias_temp,
-                             hid_shift + in_n.at(0) * hy_stride,
-                             true);
-                    // Update time
-                    profileRNNkernels(handle, 1, ctime);
+                        OpTensor(handle,
+                                 miopenTensorOpAdd,
+                                 &alpha0,
+                                 sp_desc,
+                                 reserveSpace,
+                                 &alpha1,
+                                 w_desc,
+                                 w,
+                                 &beta_t,
+                                 sp_desc,
+                                 reserveSpace,
+                                 hid_shift + in_n.at(0) * hy_stride,
+                                 wei_shift_bias_temp,
+                                 hid_shift + in_n.at(0) * hy_stride,
+                                 true);
+                        // Update time
+                        profileRNNkernels(handle, 1, ctime);
 
-                    if(dirMode != 0u)
-                    {
-                        if(in_n.at(0) == in_n.at(seqLen - 1))
-                        {
-                            OpTensor(handle,
-                                     miopenTensorOpAdd,
-                                     &alpha0,
-                                     sp_desc,
-                                     reserveSpace,
-                                     &alpha1,
-                                     w_desc,
-                                     w,
-                                     &beta_t,
-                                     sp_desc,
-                                     reserveSpace,
-                                     hid_shift + wei_len,
-                                     wei_shift_bias_temp + wei_len,
-                                     hid_shift + wei_len,
-                                     true);
-                            // Update time
-                            profileRNNkernels(handle, 1, ctime);
-                        }
-                        else
+                        if(dirMode != 0u)
                         {
-                            int cur_batch = 0;
-                            for(int ti = 0; ti < seqLen; ti++)
+                            if(in_n.at(0) == in_n.at(seqLen - 1))
                             {
-                                if(ti != (seqLen - 1))
+                                OpTensor(handle,
+                                         miopenTensorOpAdd,
+                                         &alpha0,
+                                         sp_desc,
+                                         reserveSpace,
+                                         &alpha1,
+                                         w_desc,
+                                         w,
+                                         &beta_t,
+                                         sp_desc,
+                                         reserveSpace,
+                                         hid_shift + wei_len,
+                                         wei_shift_bias_temp + wei_len,
+                                         hid_shift + wei_len,
+                                         true);
+                                // Update time
+                                profileRNNkernels(handle, 1, ctime);
+                            }
+                            else
+                            {
+                                int cur_batch = 0;
+                                for(int ti = 0; ti < seqLen; ti++)
                                 {
-                                    offset = hid_shift + cur_batch * hy_stride;
+                                    if(ti != (seqLen - 1))
+                                    {
+                                        offset = hid_shift + cur_batch * hy_stride;
 
-                                    sp_size[1] = in_n.at(ti + 1);
-                                    sp_size[2] = wei_len;
-                                    sp_desc =
-                                        miopen::TensorDescriptor(wDesc.GetType(), sp_size, sp_stride);
+                                        sp_size[1] = in_n.at(ti + 1);
+                                        sp_size[2] = wei_len;
+                                        sp_desc = miopen::TensorDescriptor(
+                                            wDesc.GetType(), sp_size, sp_stride);
 
-                                    OpTensor(handle,
-                                             miopenTensorOpAdd,
-                                             &alpha0,
-                                             sp_desc,
-                                             reserveSpace,
-                                             &alpha1,
-                                             w_desc,
-                                             w,
-                                             &beta_t,
-                                             sp_desc,
-                                             reserveSpace,
-                                             static_cast<int>(offset) + wei_len,
-                                             wei_shift_bias_temp + wei_len,
-                                             static_cast<int>(offset) + wei_len,
-                                             true);
-                                    // Update time
-                                    profileRNNkernels(handle, 1, ctime);
+                                        OpTensor(handle,
+                                                 miopenTensorOpAdd,
+                                                 &alpha0,
+                                                 sp_desc,
+                                                 reserveSpace,
+                                                 &alpha1,
+                                                 w_desc,
+                                                 w,
+                                                 &beta_t,
+                                                 sp_desc,
+                                                 reserveSpace,
+                                                 static_cast<int>(offset) + wei_len,
+                                                 wei_shift_bias_temp + wei_len,
+                                                 static_cast<int>(offset) + wei_len,
+                                                 true);
+                                        // Update time
+                                        profileRNNkernels(handle, 1, ctime);
+                                    }
+                                    cur_batch += in_n.at(ti);
                                 }
-                                cur_batch += in_n.at(ti);
                             }
                         }
                     }
@@ -5374,18 +5397,17 @@ void RNNDescriptor::RNNBackwardDataPackedTensors(
         // dinput
         if(inputMode == miopenRNNskip)
         {
-            sp_size[1] = batch_n;
-            sp_size[2] = hy_h;
-            x_size[1]  = batch_n;
-            x_size[2]  = hy_h;
-            x_desc     = miopen::TensorDescriptor(rnn_data_type, x_size, x_stride);
-            sp_desc    = miopen::TensorDescriptor(rnn_data_type, sp_size, sp_stride);
+            const std::vector<int> dx_size{1, batch_n, hy_h};
+            x_desc  = miopen::TensorDescriptor(rnn_data_type, dx_size, x_stride);
+            sp_desc = miopen::TensorDescriptor(rnn_data_type, dx_size, sp_stride);
 
             alpha0 = 1;
             alpha1 = 1;
             beta_t = 0;
 
-            for(int gi = 0; gi < nHiddenTensorsPerLayer * bi; gi++)
+            CopyTensor(handle, sp_desc, workSpace, x_desc, dx, 0, 0, true);
+            profileRNNkernels(handle, 1, ctime);
+            for(int gi = 1; gi < nHiddenTensorsPerLayer * bi; gi++)
             {
                 OpTensor(handle,
                          miopenTensorOpAdd,
diff --git a/src/rnn/rnn_util.cpp b/src/rnn/rnn_util.cpp
index 8761155de7..376c728347 100644
--- a/src/rnn/rnn_util.cpp
+++ b/src/rnn/rnn_util.cpp
@@ -317,6 +317,10 @@ void RNNTensorBaseLayoutConverter::ChangeTensorGPUDataPadding(
         const std::vector<size_t> packed_stride =
             get_packed_stride(copy_size, tensor_desc.GetLayoutVector());
 
+        // Nothing to copy, avoiding error with zero lens in TensorDescriptor
+        if(!std::all_of(copy_size.cbegin(), copy_size.cend(), [](size_t x) { return x > 0; }))
+            continue;
+
         const auto packed_desc =
             miopen::TensorDescriptor(tensor_desc.GetType(), copy_size, packed_stride);
         const auto padded_desc =
diff --git a/src/seq_tensor.cpp b/src/seq_tensor.cpp
index 7d7a9c32b5..e84e5c10dd 100644
--- a/src/seq_tensor.cpp
+++ b/src/seq_tensor.cpp
@@ -146,7 +146,7 @@ SeqTensorDescriptor::SeqTensorDescriptor(miopenDataType_t t,
     : SeqTensorDescriptor(t,
                           layout_in,
                           ConvertLengthsOrThrow(lens_in, "Lengths must be > 0"),
-                          ConvertLengthsOrThrow(seq_len, "SequenceLengths must be >= 0"),
+                          ConvertLengthsOrThrow(seq_len, "SequenceLengths must be >= 0", true),
                           {},
                           padding_marker_in,
                           use_seq_len,
@@ -429,22 +429,31 @@ std::vector<size_t> SeqTensorDescriptor::GetBatchesPerSequence() const
     }
     else
     {
+        batches.reserve(sequence_len[0]);
         auto block_begin = sequence_len.rbegin();
-        auto sample_ptr  = sequence_len.rbegin();
-        auto batch_size  = sequence_len.size();
 
-        batches.insert(batches.end(), *block_begin, batch_size);
+        while(block_begin != sequence_len.rend() && *block_begin == 0)
+            ++block_begin;
 
-        while(sample_ptr != sequence_len.rend())
+        if(block_begin != sequence_len.rend())
         {
-            if(*sample_ptr != *block_begin)
+            auto sample_ptr = block_begin;
+            auto batch_size = sequence_len.rend() - block_begin;
+
+            batches.insert(batches.end(), *block_begin, batch_size);
+
+            while(sample_ptr != sequence_len.rend())
             {
-                batch_size           = batch_size - (sample_ptr - block_begin);
-                const auto seq_count = *sample_ptr - *block_begin;
-                batches.insert(batches.end(), seq_count, batch_size);
-                block_begin = sample_ptr;
+                if(*sample_ptr != *block_begin)
+                {
+                    batch_size           = batch_size - (sample_ptr - block_begin);
+                    const auto seq_count = *sample_ptr - *block_begin;
+                    batches.insert(batches.end(), seq_count, batch_size);
+
+                    block_begin = sample_ptr;
+                }
+                sample_ptr++;
             }
-            sample_ptr++;
         }
     }
     return batches;
diff --git a/test/cpu_lstm.hpp b/test/cpu_lstm.hpp
deleted file mode 100644
index bbeef80012..0000000000
--- a/test/cpu_lstm.hpp
+++ /dev/null
@@ -1,1549 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-
-#pragma once
-
-/**********************************************
- * LSTM CPU verification functions
- *
- **********************************************/
-
-template <typename T>
-void LSTMFwdCPUVerify(miopen::Handle& handle,
-                      bool use_dropout,
-                      const miopen::DropoutDescriptor& dropoutDesc,
-                      const std::vector<T>& in,
-                      const std::vector<T>& wei, // [ input_state_weight_trans
-                                                 // hidden_state_weight0_trans input1_trans
-                                                 // hidden1_trans ...
output_weight; - // bidirectional reversed weights ] - std::vector& hy_host, // current/final hidden state - const std::vector& hx, // initial hidden state - std::vector& cy_host, // current/final cell state - const std::vector& cx, // initial cell state - std::vector& out_host, - const std::vector& in_n, // input batch size - int in_h, // input data length - int seqLength_cpu, // Number of iterations to unroll over - int bidirection, // whether using bidirectional net - int biased, // whether using bias - int hy_d, // 1 by numlayer (number of stacks of hidden layers) for - // unidirection, 2 by numlayer for bidirection - int hy_n, // equal to input batch size in_n[0] - int hy_h, // hidden state number - int out_h, // 1 by hy_h related function for unidirection, 2 by hy_h - // related function for bidirection - int inputMode_cpu, - std::vector& rsvspace, - bool hx_is_null, - bool cx_is_null) -{ - int batch_n_cpu = sumvc(in_n); - - int numlayer = bidirection == 1 ? hy_d / 2 : hy_d; - int bi = bidirection == 1 ? 2 : 1; - - int in_stride = in_h; - int out_stride = out_h; - int wei_stride = bi * 4 * hy_h; - int hy_stride = bi * 6 * hy_h; - int h_stride = bi * hy_h; - int uni_stride = hy_h; - int bi_stride = hy_h * bi; - - if(inputMode_cpu == 1) - { - if(in_h != hy_h) - { - std::cout - << "Verification cannot be completed: The input tensor size must equal to the " - << "hidden state size of the network in SKIP_INPUT mode!" << std::endl; - return; - } - in_h = 0; - } - - int wei_shift_bias = (in_h + hy_h + (bi * hy_h + hy_h) * (numlayer - 1)) * wei_stride; - - // initial dropoput - std::vector dropout_states_host; - std::vector dropout_reservespace_host; - std::vector dropout_hid_state; - miopenTensorDescriptor_t dropout_inputTensor{}, dropout_outputTensor{}; - if(use_dropout) - { - size_t states_size = dropoutDesc.stateSizeInBytes / sizeof(prngStates); - dropout_states_host = std::vector(states_size); - InitKernelStateEmulator(dropout_states_host, dropoutDesc); - - std::array drop_in_len = {{batch_n_cpu, hy_h * bi}}; - std::array drop_in_str = {{hy_stride, 1}}; - std::array drop_out_str = {{hy_h * bi, 1}}; - miopenCreateTensorDescriptor(&dropout_inputTensor); - miopenCreateTensorDescriptor(&dropout_outputTensor); - miopenSetTensorDescriptor( - dropout_inputTensor, miopenFloat, 2, drop_in_len.data(), drop_in_str.data()); - miopenSetTensorDescriptor( - dropout_outputTensor, miopenFloat, 2, drop_in_len.data(), drop_out_str.data()); - - size_t reserveSpaceSizeInBytes = 0; - miopenDropoutGetReserveSpaceSize(dropout_inputTensor, &reserveSpaceSizeInBytes); - size_t reserve_size = reserveSpaceSizeInBytes / sizeof(unsigned char); - dropout_reservespace_host = std::vector(reserve_size * (numlayer - 1), - static_cast(1)); - - dropout_hid_state = - std::vector((numlayer - 1) * batch_n_cpu * hy_h * bi, static_cast(0)); - } - - // forward emulator - for(int li = 0; li < numlayer; li++) - { - int hid_shift = li * batch_n_cpu * hy_stride; - int hx_shift = li * in_n.at(0) * h_stride; - - // from input - if(li == 0) - { - if(inputMode_cpu == 1) - { - for(int bs = 0; bs < batch_n_cpu; bs++) - { - for(int h = 0; h < hy_h; h++) - { - for(int gi = 0; gi < 4; gi++) - { - rsvspace.at(hid_shift + bs * hy_stride + gi * hy_h + h) += - in.at(bs * in_stride + h); - if(bidirection == 1) - { - rsvspace.at(hid_shift + bs * hy_stride + (gi + 4) * hy_h + h) += - in.at(bs * in_stride + h); - } - } - } - } - - // from bias - if(biased == 1) - { - for(int bs = 0; bs < batch_n_cpu; bs++) - { - for(int h = 0; h < wei_stride; h++) - 
{ - rsvspace.at(hid_shift + bs * hy_stride + h) += - wei.at(wei_shift_bias + h); - } - } - } - } - else - { - RNN_mm_cpu(in.data(), - in_h, - batch_n_cpu, - in_stride, - 0, - wei.data(), - in_h, - hy_h * bi * 4, - in_stride, - RNN_MM_TRANSPOSE, - &rsvspace[hid_shift], - hy_h * bi * 4, - batch_n_cpu, - hy_stride, - 0, - 1, - 1); - - // from bias - if(biased == 1) - { - for(int bs = 0; bs < batch_n_cpu; bs++) - { - for(int h = 0; h < wei_stride; h++) - { - rsvspace.at(hid_shift + bs * hy_stride + h) += - wei.at(wei_shift_bias + h); - } - } - } - } - } - else - { - int wei_shift = (in_h + hy_h) * wei_stride + (li - 1) * (bi * hy_h + hy_h) * wei_stride; - int prelayer_shift = (li - 1) * batch_n_cpu * hy_stride + bi * 5 * hy_h; - if(use_dropout) - { - auto dropout_states_tmp = dropout_states_host; - size_t drop_out_offset = (static_cast(li) - 1) * batch_n_cpu * hy_h * bi; - - DropoutForwardVerify(handle, - dropoutDesc, - miopen::deref(dropout_inputTensor), - rsvspace, - miopen::deref(dropout_outputTensor), - dropout_hid_state, - dropout_reservespace_host, - dropout_states_tmp, - prelayer_shift, - drop_out_offset, - drop_out_offset); - - prelayer_shift = drop_out_offset; - } - - RNN_mm_cpu(use_dropout ? &dropout_hid_state[prelayer_shift] - : &rsvspace[prelayer_shift], - hy_h * bi, - batch_n_cpu, - use_dropout ? hy_h * bi : hy_stride, - 0, - &wei[wei_shift], - hy_h * bi, - hy_h * bi * 4, - bi_stride, - RNN_MM_TRANSPOSE, - &rsvspace[hid_shift], - hy_h * bi * 4, - batch_n_cpu, - hy_stride, - 0, - 1, - 1); - - // from bias - if(biased == 1) - { - int wei_shift_bias_temp = wei_shift_bias + li * 2 * wei_stride; - - for(int bs = 0; bs < batch_n_cpu; bs++) - { - for(int h = 0; h < wei_stride; h++) - { - rsvspace.at(hid_shift + bs * hy_stride + h) += - wei.at(wei_shift_bias_temp + h); - } - } - } - } - - // from hidden state - int bacc = 0; - int baccbi = batch_n_cpu; - for(int ti = 0; ti < seqLength_cpu; ti++) - { - baccbi -= in_n.at(seqLength_cpu - 1 - ti); - int wei_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; - - if(ti == 0) - { - if(!hx_is_null) - { - RNN_mm_cpu(&hx[hx_shift], - hy_h, - in_n.at(ti), - uni_stride, - 0, - &wei[wei_shift], - hy_h, - hy_h * 4, - uni_stride, - RNN_MM_TRANSPOSE, - &rsvspace[hid_shift + bacc * hy_stride], - hy_h * 4, - in_n.at(ti), - hy_stride, - 0, - 1, - 1); - - // from bias - if(biased == 1) - { - int wei_shift_bias_temp = wei_shift_bias + (li * 2 + 1) * wei_stride; - - for(int bs = 0; bs < in_n.at(ti); bs++) - { - for(int h = 0; h < 4 * hy_h; h++) - { - rsvspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h) += - wei.at(wei_shift_bias_temp + h); - } - } - } - - if(bidirection == 1) - { - RNN_mm_cpu(&hx[hx_shift + hy_n * hy_h], - hy_h, - in_n.at(seqLength_cpu - 1 - ti), - uni_stride, - 0, - &wei[wei_shift + 4 * hy_h * uni_stride], - hy_h, - hy_h * 4, - uni_stride, - RNN_MM_TRANSPOSE, - &rsvspace[hid_shift + baccbi * hy_stride + 4 * hy_h], - hy_h * 4, - in_n.at(seqLength_cpu - 1 - ti), - hy_stride, - 0, - 1, - 1); - - // from bias - if(biased == 1) - { - int wei_shift_bias_temp = wei_shift_bias + (li * 2 + 1) * wei_stride; - - for(int bs = 0; bs < in_n.at(seqLength_cpu - 1 - ti); bs++) - { - for(int h = 0; h < 4 * hy_h; h++) - { - rsvspace.at(hid_shift + baccbi * hy_stride + 4 * hy_h + - bs * hy_stride + h) += - wei.at(wei_shift_bias_temp + 4 * hy_h + h); - } - } - } - } - } - } - else - { - RNN_mm_cpu(&hy_host[hx_shift], - hy_h, - in_n.at(ti), - uni_stride, - 0, - &wei[wei_shift], - hy_h, - hy_h * 4, - uni_stride, - RNN_MM_TRANSPOSE, - 
&rsvspace[hid_shift + bacc * hy_stride], - hy_h * 4, - in_n.at(ti), - hy_stride, - 0, - 1, - 1); - - // from bias - if(biased == 1) - { - int wei_shift_bias_temp = wei_shift_bias + (li * 2 + 1) * wei_stride; - - for(int bs = 0; bs < in_n.at(ti); bs++) - { - for(int h = 0; h < 4 * hy_h; h++) - { - rsvspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h) += - wei.at(wei_shift_bias_temp + h); - } - } - } - - if(bidirection == 1) - { - - if(!hx_is_null && in_n.at(seqLength_cpu - 1 - ti) > in_n.at(seqLength_cpu - ti)) - { - RNN_mm_cpu( - &hx[hx_shift + hy_n * hy_h + in_n.at(seqLength_cpu - ti) * hy_h], - hy_h, - (in_n.at(seqLength_cpu - 1 - ti) - in_n.at(seqLength_cpu - ti)), - uni_stride, - 0, - &wei[wei_shift + 4 * hy_h * uni_stride], - hy_h, - hy_h * 4, - uni_stride, - RNN_MM_TRANSPOSE, - &rsvspace[hid_shift + - (baccbi + in_n.at(seqLength_cpu - ti)) * hy_stride + - 4 * hy_h], - hy_h * 4, - (in_n.at(seqLength_cpu - 1 - ti) - in_n.at(seqLength_cpu - ti)), - hy_stride, - 0, - 1, - 1); - - // from bias - if(biased == 1) - { - int wei_shift_bias_temp = wei_shift_bias + (li * 2 + 1) * wei_stride; - - for(int bs = in_n.at(seqLength_cpu - ti); - bs < in_n.at(seqLength_cpu - 1 - ti); - bs++) - { - for(int h = 0; h < 4 * hy_h; h++) - { - rsvspace.at(hid_shift + baccbi * hy_stride + 4 * hy_h + - bs * hy_stride + h) += - wei.at(wei_shift_bias_temp + 4 * hy_h + h); - } - } - } - } - - RNN_mm_cpu(&hy_host[hx_shift + hy_n * hy_h], - hy_h, - in_n.at(seqLength_cpu - ti), - uni_stride, - 0, - &wei[wei_shift + 4 * hy_h * uni_stride], - hy_h, - hy_h * 4, - uni_stride, - RNN_MM_TRANSPOSE, - &rsvspace[hid_shift + baccbi * hy_stride + 4 * hy_h], - hy_h * 4, - in_n.at(seqLength_cpu - ti), - hy_stride, - 0, - 1, - 1); - - // from bias - if(biased == 1) - { - int wei_shift_bias_temp = wei_shift_bias + (li * 2 + 1) * wei_stride; - - for(int bs = 0; bs < in_n.at(seqLength_cpu - ti); bs++) - { - for(int h = 0; h < 4 * hy_h; h++) - { - rsvspace.at(hid_shift + baccbi * hy_stride + 4 * hy_h + - bs * hy_stride + h) += - wei.at(wei_shift_bias_temp + 4 * hy_h + h); - } - } - } - } - } - - for(int bs = 0; bs < in_n.at(ti); bs++) - { - for(int h = 0; h < hy_h; h++) - { - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += - activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + h), 2) * - activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h), - 1); - if(ti == 0) - { - if(!cx_is_null) - { - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += - activfunc( - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h), - 2) * - cx.at(hx_shift + bs * uni_stride + h); - } - } - else - { - int prec_shift = li * batch_n_cpu * hy_stride + - (bacc - in_n.at(ti - 1)) * hy_stride + bi * 4 * hy_h; - - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += - activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h), - 2) * - rsvspace.at(prec_shift + bs * hy_stride + h); - } - - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h) += - activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h), - 2) * - activfunc( - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h), - 1); - - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + h + - numlayer * batch_n_cpu * hy_stride) = - activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + h), 2); - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h + - numlayer * batch_n_cpu * hy_stride) = - activfunc(rsvspace.at(hid_shift + (bacc + bs) * 
hy_stride + hy_h + h), 2); - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h + - numlayer * batch_n_cpu * hy_stride) = - activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h), - 2); - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h + - numlayer * batch_n_cpu * hy_stride) = - activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h), - 1); - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h + - numlayer * batch_n_cpu * hy_stride) = - activfunc( - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h), - 1); - - cy_host.at(hx_shift + bs * uni_stride + h) = - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h); - hy_host.at(hx_shift + bs * uni_stride + h) = - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h); - } - } - - if(bidirection == 1) - { - for(int bs = 0; bs < in_n.at(seqLength_cpu - 1 - ti); bs++) - { - for(int h = 0; h < hy_h; h++) - { - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + hy_h + - h) += - activfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h), - 2) * - activfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h), - 1); - if(ti == 0) - { - if(!cx_is_null) - { - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + - hy_h + h) += - activfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + - 5 * hy_h + h), - 2) * - cx.at(hx_shift + bs * uni_stride + hy_n * hy_h + h); - } - } - else - { - - if(!cx_is_null && - in_n.at(seqLength_cpu - 1 - ti) > in_n.at(seqLength_cpu - ti)) - { - if(bs >= in_n.at(seqLength_cpu - ti)) - { - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + - bi * 4 * hy_h + hy_h + h) += - activfunc(rsvspace.at(hid_shift + - (baccbi + bs) * hy_stride + 5 * hy_h + - h), - 2) * - cx.at(hx_shift + bs * uni_stride + hy_n * hy_h + h); - } - } - - if(bs < in_n.at(seqLength_cpu - ti)) - { - int prec_shift = - li * batch_n_cpu * hy_stride + - (baccbi + in_n.at(seqLength_cpu - 1 - ti)) * hy_stride + - bi * 4 * hy_h + hy_h; - - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + - hy_h + h) += - activfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + - 5 * hy_h + h), - 2) * - rsvspace.at(prec_shift + bs * hy_stride + h); - } - } - - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 5 * hy_h + hy_h + - h) += - activfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h), - 2) * - activfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + - bi * 4 * hy_h + hy_h + h), - 1); - - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h + - numlayer * batch_n_cpu * hy_stride) = - activfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h), - 2); - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h + - numlayer * batch_n_cpu * hy_stride) = - activfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h), - 2); - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h + - numlayer * batch_n_cpu * hy_stride) = - activfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h), - 2); - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h + - numlayer * batch_n_cpu * hy_stride) = - activfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h), - 1); - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + hy_h + - h + numlayer * batch_n_cpu * hy_stride) = - activfunc(rsvspace.at(hid_shift + (baccbi + 
bs) * hy_stride + - bi * 4 * hy_h + hy_h + h), - 1); - - cy_host.at(hx_shift + bs * uni_stride + hy_n * hy_h + h) = rsvspace.at( - hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + hy_h + h); - hy_host.at(hx_shift + bs * uni_stride + hy_n * hy_h + h) = rsvspace.at( - hid_shift + (baccbi + bs) * hy_stride + bi * 5 * hy_h + hy_h + h); - } - } - } - - bacc += in_n.at(ti); - } - } - - // output - int prelayer_shift = (numlayer - 1) * batch_n_cpu * hy_stride + bi * 5 * hy_h; - - for(int bs = 0; bs < batch_n_cpu; bs++) - { - for(int h = 0; h < out_h; h++) - { - out_host.at(bs * out_stride + h) = rsvspace.at(prelayer_shift + bs * hy_stride + h); - } - } - - if(use_dropout) - { - for(int i = 0; i < (numlayer - 1) * batch_n_cpu * hy_h * bi; i++) - { - rsvspace.at(numlayer * batch_n_cpu * hy_stride * 2 + i) = dropout_hid_state.at(i); - } - auto p_drop_rsv = reinterpret_cast(&rsvspace.at( - numlayer * batch_n_cpu * hy_stride * 2 + (numlayer - 1) * batch_n_cpu * hy_h * bi)); - for(int i = 0; i < (numlayer - 1) * batch_n_cpu * hy_h * bi; i++) - { - *(p_drop_rsv + i) = dropout_reservespace_host.at(i); - } - } -} - -template -void LSTMBwdDataCPUVerify(bool use_dropout_cpu, - const miopen::DropoutDescriptor& dropoutDesc, - std::vector& din_host, - const std::vector& wei, // [ input_state_weight_trans - // hidden_state_weight0_trans input1_trans - // hidden1_trans ... output_weight; - // bidirectional reversed weights ] - const std::vector& dhy_cpu, // current/final hidden state - std::vector& dhx_host, - const std::vector& hx, // initial hidden state - const std::vector& dcy_cpu, // current/final cell state - std::vector& dcx_host, - const std::vector& cx, - const std::vector& out, - const std::vector& dout, - const std::vector& in_n, // input batch size - int in_h, // input data length - int seqLength_cpu, // Number of iterations to unroll over - int bidirection, // whether using bidirectional net - int, // whether using bias - int hy_d, // 1 by numlayer (number of stacks of hidden layers) - // for unidirection, 2 by numlayer for bidirection - int hy_n, // equal to input batch size in_n[0] - int hy_h, // hidden state number - int out_h, // 1 by hy_h related function for unidirection, 2 by - // hy_h related function for bidirection - int inputMode_cpu, - std::vector& rsvspace, - std::vector& wkspace, - bool cx_is_null, - bool dhy_is_null, - bool dcy_is_null) -{ - int batch_n_cpu = sumvc(in_n); - (void)out; - (void)hx; - - int numlayer = bidirection == 1 ? hy_d / 2 : hy_d; - int bi = bidirection == 1 ? 2 : 1; - - int in_stride = in_h; - int out_stride = out_h; - int wei_stride = bi * 4 * hy_h; - int hy_stride = bi * 6 * hy_h; - int h_stride = bi * hy_h; - int uni_stride = hy_h; - int bi_stride = hy_h * bi; - - if(inputMode_cpu == 1) - { - if(in_h != hy_h) - { - std::cout - << "Verification cannot be completed: The input tensor size must equal to the " - << "hidden state size of the network in SKIP_INPUT mode!" 
<< std::endl; - return; - } - in_h = 0; - } - - // initial dropoput - miopenTensorDescriptor_t dropout_inputTensor{}; - std::vector dropout_reservespace_host; - if(use_dropout_cpu) - { - std::array drop_in_len = {{batch_n_cpu, hy_h * bi}}; - std::array drop_in_str = {{hy_stride, 1}}; - miopenCreateTensorDescriptor(&dropout_inputTensor); - miopenSetTensorDescriptor( - dropout_inputTensor, miopenFloat, 2, drop_in_len.data(), drop_in_str.data()); - - size_t reserveSpaceSizeInBytes = 0; - miopenDropoutGetReserveSpaceSize(dropout_inputTensor, &reserveSpaceSizeInBytes); - size_t reserve_size = reserveSpaceSizeInBytes / sizeof(unsigned char); - dropout_reservespace_host = std::vector(reserve_size * (numlayer - 1), - static_cast(0)); - - auto p_drop_rsv = reinterpret_cast(&rsvspace.at( - numlayer * batch_n_cpu * hy_stride * 2 + (numlayer - 1) * batch_n_cpu * hy_h * bi)); - for(int i = 0; i < (numlayer - 1) * batch_n_cpu * hy_h * bi; i++) - { - dropout_reservespace_host.at(i) = *(p_drop_rsv + i); - } - } - - // bwd data emulator - for(int li = numlayer - 1; li >= 0; li--) - { - int wei_shift = (in_h + hy_h) * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; - int hid_shift = li * batch_n_cpu * hy_stride; - int hx_shift = li * in_n.at(0) * h_stride; - - if(li == numlayer - 1) - { - for(int bs = 0; bs < batch_n_cpu; bs++) - { - for(int h = 0; h < out_h; h++) - { - wkspace.at(hid_shift + bi * 5 * hy_h + bs * hy_stride + h) += - dout.at(bs * out_stride + h); - } - } - } - else - { - int prelayer_shift = (li + 1) * batch_n_cpu * hy_stride; - - RNN_mm_cpu(&wkspace[prelayer_shift], - hy_h * bi * 4, - batch_n_cpu, - hy_stride, - 0, - &wei[wei_shift], - hy_h * bi, - hy_h * bi * 4, - bi_stride, - 0, - &wkspace[hid_shift + bi * 5 * hy_h], - hy_h * bi, - batch_n_cpu, - hy_stride, - 0, - 1, - 1); - - if(use_dropout_cpu) - { - DropoutBackwardVerify(dropoutDesc, - miopen::deref(dropout_inputTensor), - wkspace, - miopen::deref(dropout_inputTensor), - wkspace, - dropout_reservespace_host, - hid_shift + bi * 5 * hy_h, - hid_shift + bi * 5 * hy_h, - li * batch_n_cpu * hy_h * bi); - } - } - - // from hidden state - int bacc = batch_n_cpu; - int baccbi = 0; - for(int ti = seqLength_cpu - 1; ti >= 0; ti--) - { - bacc -= in_n.at(ti); - - if(ti == seqLength_cpu - 1) - { - for(int bs = 0; bs < in_n.at(ti); bs++) - { - for(int h = 0; h < hy_h; h++) - { - if(!dhy_is_null) - { - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h) += - dhy_cpu.at(hx_shift + bs * uni_stride + h); - } - if(!dcy_is_null) - { - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += - dcy_cpu.at(hx_shift + bs * uni_stride + h); - } - } - } - - if(bidirection == 1) - { - for(int bs = 0; bs < in_n.at(seqLength_cpu - 1 - ti); bs++) - { - for(int h = 0; h < hy_h; h++) - { - if(!dhy_is_null) - { - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 5 * hy_h + - hy_h + h) += - dhy_cpu.at(hx_shift + bs * uni_stride + hy_n * hy_h + h); - } - if(!dcy_is_null) - { - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + - hy_h + h) += - dcy_cpu.at(hx_shift + bs * uni_stride + hy_n * hy_h + h); - } - } - } - } - } - else - { - if(!dhy_is_null && in_n.at(ti) > in_n.at(ti + 1)) - { - for(int bs = in_n.at(ti + 1); bs < in_n.at(ti); bs++) - { - for(int h = 0; h < hy_h; h++) - { - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h) += - dhy_cpu.at(hx_shift + bs * uni_stride + h); - } - } - } - - if(!dcy_is_null && in_n.at(ti) > in_n.at(ti + 1)) - { - for(int bs = in_n.at(ti + 1); bs < in_n.at(ti); 
bs++) - { - for(int h = 0; h < hy_h; h++) - { - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += - dcy_cpu.at(hx_shift + bs * uni_stride + h); - } - } - } - - int pretime_shift = li * batch_n_cpu * hy_stride + (bacc + in_n.at(ti)) * hy_stride; - int weitime_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; - - RNN_mm_cpu(&wkspace[pretime_shift], - hy_h * 4, - in_n.at(ti + 1), - hy_stride, - 0, - &wei[weitime_shift], - hy_h, - hy_h * 4, - uni_stride, - 0, - &wkspace[hid_shift + bacc * hy_stride + bi * 5 * hy_h], - hy_h, - in_n.at(ti + 1), - hy_stride, - 0, - 1, - 1); - - if(bidirection == 1) - { - pretime_shift = li * batch_n_cpu * hy_stride + - (baccbi - in_n.at(seqLength_cpu - 2 - ti)) * hy_stride + - hy_h * 4; - weitime_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride + - hy_h * 4 * uni_stride; - - RNN_mm_cpu(&wkspace[pretime_shift], - hy_h * 4, - in_n.at(seqLength_cpu - 1 - ti), - hy_stride, - 0, - &wei[weitime_shift], - hy_h, - hy_h * 4, - uni_stride, - 0, - &wkspace[hid_shift + baccbi * hy_stride + bi * 5 * hy_h + hy_h], - hy_h, - in_n.at(seqLength_cpu - 1 - ti), - hy_stride, - 0, - 1, - 1); - } - } - - for(int bs = 0; bs < in_n.at(ti); bs++) - { - for(int h = 0; h < hy_h; h++) - { - if(ti < seqLength_cpu - 1) - { - if(bs < in_n.at(ti + 1)) - { - int pretime_shift = - li * batch_n_cpu * hy_stride + (bacc + in_n.at(ti)) * hy_stride; - - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += - wkspace.at(pretime_shift + bs * hy_stride + bi * 4 * hy_h + h) * - activfunc(rsvspace.at(pretime_shift + bs * hy_stride + hy_h + h), - 2); - } - } - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h) * - dervactivfunc( - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h), - 1) * - activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h), - 2); - - if(ti == 0) - { - if(!cx_is_null) - { - wkspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h) += - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + - h) * - cx.at(hx_shift + bs * uni_stride + h) * - dervactivfunc( - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h), 2); - } - } - else - { - int pretime_shift = - li * batch_n_cpu * hy_stride + (bacc - in_n.at(ti - 1)) * hy_stride; - - wkspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h) += - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) * - rsvspace.at(pretime_shift + bs * hy_stride + bi * 4 * hy_h + h) * - dervactivfunc( - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h), 2); - } - wkspace.at(hid_shift + (bacc + bs) * hy_stride + h) += - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) * - activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h), - 1) * - dervactivfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + h), 2); - wkspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h) += - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h) * - activfunc( - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h), - 1) * - dervactivfunc( - rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h), 2); - wkspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h) += - wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) * - activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + h), 2) * - dervactivfunc( - rsvspace.at(hid_shift + (bacc + bs) * 
hy_stride + 3 * hy_h + h), 1); - } - } - - if(bidirection == 1) - { - for(int bs = 0; bs < in_n.at(seqLength_cpu - 1 - ti); bs++) - { - for(int h = 0; h < hy_h; h++) - { - if(ti < seqLength_cpu - 1) - { - int pretime_shift = - li * batch_n_cpu * hy_stride + - (baccbi - in_n.at(seqLength_cpu - 2 - ti)) * hy_stride; - - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + - hy_h + h) += - wkspace.at(pretime_shift + bs * hy_stride + bi * 4 * hy_h + hy_h + - h) * - activfunc( - rsvspace.at(pretime_shift + bs * hy_stride + 5 * hy_h + h), 2); - } - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + hy_h + - h) += - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 5 * hy_h + - hy_h + h) * - dervactivfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + - bi * 4 * hy_h + hy_h + h), - 1) * - activfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h), - 2); - - if(ti == 0) - { - if(!cx_is_null) - { - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h) += - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + - bi * 4 * hy_h + hy_h + h) * - cx.at(hx_shift + bs * uni_stride + hy_n * hy_h + h) * - dervactivfunc(rsvspace.at(hid_shift + - (baccbi + bs) * hy_stride + 5 * hy_h + - h), - 2); - } - } - else - { - if(!cx_is_null && - in_n.at(seqLength_cpu - 1 - ti) > in_n.at(seqLength_cpu - ti) && - bs >= in_n.at(seqLength_cpu - ti)) - { - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h) += - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + - bi * 4 * hy_h + hy_h + h) * - cx.at(hx_shift + bs * uni_stride + hy_n * hy_h + h) * - dervactivfunc(rsvspace.at(hid_shift + - (baccbi + bs) * hy_stride + 5 * hy_h + - h), - 2); - } - - if(bs < in_n.at(seqLength_cpu - ti)) - { - int pretime_shift = - li * batch_n_cpu * hy_stride + - (baccbi + in_n.at(seqLength_cpu - 1 - ti)) * hy_stride; - - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h) += - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + - bi * 4 * hy_h + hy_h + h) * - rsvspace.at(pretime_shift + bs * hy_stride + bi * 4 * hy_h + - hy_h + h) * - dervactivfunc(rsvspace.at(hid_shift + - (baccbi + bs) * hy_stride + 5 * hy_h + - h), - 2); - } - } - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h) += - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + - hy_h + h) * - activfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h), - 1) * - dervactivfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h), - 2); - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h) += - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 5 * hy_h + - hy_h + h) * - activfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + - bi * 4 * hy_h + hy_h + h), - 1) * - dervactivfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h), - 2); - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h) += - wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + - hy_h + h) * - activfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h), - 2) * - dervactivfunc( - rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h), - 1); - } - } - } - - baccbi += in_n.at(seqLength_cpu - 1 - ti); - } - - // dcx, dhx - int pretime_shift = li * batch_n_cpu * hy_stride; - int weitime_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; - - RNN_mm_cpu(&wkspace[pretime_shift], - hy_h * 4, - in_n.at(0), - hy_stride, - 0, - &wei[weitime_shift], - hy_h, - hy_h 
* 4, - uni_stride, - 0, - &dhx_host[hx_shift], - hy_h, - in_n.at(0), - uni_stride, - 0, - 1, - 1); - - for(int bs = 0; bs < in_n.at(0); bs++) - { - for(int h = 0; h < hy_h; h++) - { - dcx_host.at(hx_shift + bs * uni_stride + h) += - wkspace.at(pretime_shift + bs * hy_stride + bi * 4 * hy_h + h) * - activfunc(rsvspace.at(pretime_shift + bs * hy_stride + hy_h + h), 2); - } - } - - if(bidirection == 1) - { - int ti = seqLength_cpu - 1, cur_bat = 0, pre_bat = batch_n_cpu; - - while(ti >= 0) - { - pre_bat -= in_n.at(ti); - if(in_n.at(ti) > cur_bat) - { - pretime_shift = li * batch_n_cpu * hy_stride + (pre_bat + cur_bat) * hy_stride; - - RNN_mm_cpu(&wkspace[pretime_shift + 4 * hy_h], - hy_h * 4, - (in_n.at(ti) - cur_bat), - hy_stride, - 0, - &wei[weitime_shift + 4 * hy_h * uni_stride], - hy_h, - hy_h * 4, - uni_stride, - 0, - &dhx_host[hx_shift + hy_n * hy_h + cur_bat * hy_h], - hy_h, - (in_n.at(ti) - cur_bat), - uni_stride, - 0, - 1, - 1); - - for(int bs = cur_bat; bs < in_n.at(ti); bs++) - { - for(int h = 0; h < hy_h; h++) - { - dcx_host.at(hx_shift + bs * uni_stride + hy_n * hy_h + h) += - wkspace.at(pretime_shift + (bs - cur_bat) * hy_stride + - bi * 4 * hy_h + hy_h + h) * - activfunc(rsvspace.at(pretime_shift + (bs - cur_bat) * hy_stride + - 5 * hy_h + h), - 2); - } - } - } - cur_bat = in_n.at(ti--); - } - } - } - - // dinput - if(inputMode_cpu == 1) - { - for(int bs = 0; bs < batch_n_cpu; bs++) - { - for(int h = 0; h < hy_h; h++) - { - for(int gi = 0; gi < 4; gi++) - { - din_host.at(bs * in_stride + h) += wkspace.at(bs * hy_stride + gi * hy_h + h); - if(bidirection == 1) - { - din_host.at(bs * in_stride + h) += - wkspace.at(bs * hy_stride + (gi + 4) * hy_h + h); - } - } - } - } - } - else - { - RNN_mm_cpu(wkspace.data(), - hy_h * bi * 4, - batch_n_cpu, - hy_stride, - 0, - wei.data(), - in_h, - hy_h * bi * 4, - in_stride, - 0, - din_host.data(), - in_h, - batch_n_cpu, - in_stride, - 0, - 1, - 1); - } -} - -template -void LSTMBwdWeightCPUVerify(bool use_dropout_cpu, - const std::vector& in, - std::vector& dwei_host, // [ input_state_weight_trans - // hidden_state_weight0_trans - // input1_trans hidden1_trans ... - // output_weight; bidirectional - // reversed weights ] - const std::vector& hx, // initial hidden state - const std::vector& dout, - const std::vector& in_n, // input batch size - int in_h, // input data length - int seqLength_cpu, // Number of iterations to unroll over - int bidirection, // whether using bidirectional net - int biased, // whether using bias - int hy_d, // 1 by numlayer (number of stacks of hidden - // layers) for unidirection, 2 by numlayer for - // bidirection - int hy_n, // equal to input batch size in_n[0] - int hy_h, // hidden state number - int out_h, // 1 by hy_h related function for unidirection, 2 - // by hy_h related function for bidirection - int inputMode_cpu, - const std::vector& rsvspace, - const std::vector& wkspace, - bool hx_is_null) -{ - int batch_n_cpu = sumvc(in_n); - int numlayer = bidirection == 1 ? hy_d / 2 : hy_d; - int bi = bidirection == 1 ? 2 : 1; - - int in_stride = in_h; - int wei_stride = bi * 4 * hy_h; - int hy_stride = bi * 6 * hy_h; - int h_stride = bi * hy_h; - int uni_stride = hy_h; - int bi_stride = hy_h * bi; - (void)dout; - (void)out_h; - - if(inputMode_cpu == 1) - { - if(in_h != hy_h) - { - std::cout - << "Verification cannot be completed: The input tensor size must equal to the " - << "hidden state size of the network in SKIP_INPUT mode!" 
<< std::endl; - return; - } - in_h = 0; - } - - int wei_shift_bias = (in_h + hy_h + (bi * hy_h + hy_h) * (numlayer - 1)) * wei_stride; - - // bwd weights emulator - for(int li = 0; li < numlayer; li++) - { - // between layers - if(li == 0) - { - if(inputMode_cpu != 1) - { - RNN_mm_cpu(wkspace.data(), - hy_h * bi * 4, - batch_n_cpu, - hy_stride, - RNN_MM_TRANSPOSE, - in.data(), - in_h, - batch_n_cpu, - in_stride, - 0, - dwei_host.data(), - in_h, - hy_h * bi * 4, - in_stride, - 0, - 1, - 1); - } - - if(biased == 1) - { - for(int h = 0; h < wei_stride; h++) - { - for(int w = 0; w < batch_n_cpu; w++) - { - dwei_host.at(wei_shift_bias + h) += wkspace.at(w * hy_stride + h); - } - } - } - } - else - { - int prelayer_shift = - use_dropout_cpu - ? 2 * numlayer * batch_n_cpu * hy_stride + (li - 1) * batch_n_cpu * hy_h * bi - : (li - 1) * batch_n_cpu * hy_stride + bi * hy_h * 5; - int hid_shift = li * batch_n_cpu * hy_stride; - int wei_shift = (in_h + hy_h) * wei_stride + (li - 1) * (bi * hy_h + hy_h) * wei_stride; - - RNN_mm_cpu(&wkspace[hid_shift], - hy_h * bi * 4, - batch_n_cpu, - hy_stride, - RNN_MM_TRANSPOSE, - &rsvspace[prelayer_shift], - hy_h * bi, - batch_n_cpu, - use_dropout_cpu ? hy_h * bi : hy_stride, - 0, - &dwei_host[wei_shift], - hy_h * bi, - hy_h * bi * 4, - bi_stride, - 0, - 1, - 1); - - if(biased == 1) - { - wei_shift = wei_shift_bias + li * 2 * wei_stride; - - for(int h = 0; h < wei_stride; h++) - { - for(int w = 0; w < batch_n_cpu; w++) - { - dwei_host.at(wei_shift + h) += wkspace.at(hid_shift + w * hy_stride + h); - } - } - } - } - - // between time - int bacc = 0; - for(int ti = 0; ti < seqLength_cpu; ti++) - { - int hid_shift = li * batch_n_cpu * hy_stride + bacc * hy_stride; - int hx_shift = li * in_n.at(0) * h_stride; - int wei_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; - int pretime_shift; - - // between time - if(ti == 0) - { - if(!hx_is_null) - { - RNN_mm_cpu(&wkspace[hid_shift], - hy_h * 4, - in_n.at(ti), - hy_stride, - RNN_MM_TRANSPOSE, - &hx[hx_shift], - hy_h, - in_n.at(ti), - uni_stride, - 0, - &dwei_host[wei_shift], - hy_h, - hy_h * 4, - uni_stride, - 0, - 1, - 1); - - if(biased == 1) - { - int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; - - for(int h = 0; h < hy_h * 4; h++) - { - for(int w = 0; w < in_n.at(ti); w++) - { - dwei_host.at(bias_shift + h) += - wkspace.at(hid_shift + w * hy_stride + h); - } - } - } - } - } - else - { - pretime_shift = li * batch_n_cpu * hy_stride + - (bacc - in_n.at(ti - 1)) * hy_stride + bi * 5 * hy_h; - - RNN_mm_cpu(&wkspace[hid_shift], - hy_h * 4, - in_n.at(ti), - hy_stride, - RNN_MM_TRANSPOSE, - &rsvspace[pretime_shift], - hy_h, - in_n.at(ti), - hy_stride, - 0, - &dwei_host[wei_shift], - hy_h, - hy_h * 4, - uni_stride, - 0, - 1, - 1); - - if(biased == 1) - { - int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; - - for(int h = 0; h < hy_h * 4; h++) - { - for(int w = 0; w < in_n.at(ti); w++) - { - dwei_host.at(bias_shift + h) += - wkspace.at(hid_shift + w * hy_stride + h); - } - } - } - } - - if(bidirection == 1) - { - if(ti == seqLength_cpu - 1) - { - if(!hx_is_null) - { - RNN_mm_cpu(&wkspace[hid_shift + 4 * hy_h], - hy_h * 4, - in_n.at(ti), - hy_stride, - RNN_MM_TRANSPOSE, - &hx[hx_shift + hy_n * hy_h], - hy_h, - in_n.at(ti), - uni_stride, - 0, - &dwei_host[wei_shift + 4 * hy_h * uni_stride], - hy_h, - hy_h * 4, - uni_stride, - 0, - 1, - 1); - - if(biased == 1) - { - int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; - - for(int h = 0; h < hy_h * 4; h++) - { - 
for(int w = 0; w < in_n.at(ti); w++) - { - dwei_host.at(bias_shift + hy_h * 4 + h) += - wkspace.at(hid_shift + hy_h * 4 + w * hy_stride + h); - } - } - } - } - } - else - { - if(!hx_is_null && in_n.at(ti) > in_n.at(ti + 1)) - { - RNN_mm_cpu(&wkspace[hid_shift + 4 * hy_h + in_n.at(ti + 1) * hy_stride], - hy_h * 4, - (in_n.at(ti) - in_n.at(ti + 1)), - hy_stride, - RNN_MM_TRANSPOSE, - &hx[hx_shift + hy_n * hy_h + in_n.at(ti + 1) * hy_h], - hy_h, - (in_n.at(ti) - in_n.at(ti + 1)), - uni_stride, - 0, - &dwei_host[wei_shift + 4 * hy_h * uni_stride], - hy_h, - hy_h * 4, - uni_stride, - 0, - 1, - 1); - - if(biased == 1) - { - int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; - - for(int h = 0; h < hy_h * 4; h++) - { - for(int w = in_n.at(ti + 1); w < in_n.at(ti); w++) - { - dwei_host.at(bias_shift + hy_h * 4 + h) += - wkspace.at(hid_shift + hy_h * 4 + w * hy_stride + h); - } - } - } - } - - pretime_shift = li * batch_n_cpu * hy_stride + - (bacc + in_n.at(ti)) * hy_stride + bi * 5 * hy_h; - - RNN_mm_cpu(&wkspace[hid_shift + 4 * hy_h], - hy_h * 4, - in_n.at(ti + 1), - hy_stride, - RNN_MM_TRANSPOSE, - &rsvspace[pretime_shift + hy_h], - hy_h, - in_n.at(ti + 1), - hy_stride, - 0, - &dwei_host[wei_shift + 4 * hy_h * uni_stride], - hy_h, - hy_h * 4, - uni_stride, - 0, - 1, - 1); - - if(biased == 1) - { - int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; - - for(int h = 0; h < hy_h * 4; h++) - { - for(int w = 0; w < in_n.at(ti + 1); w++) - { - dwei_host.at(bias_shift + hy_h * 4 + h) += - wkspace.at(hid_shift + hy_h * 4 + w * hy_stride + h); - } - } - } - } - } - - bacc += in_n.at(ti); - } - } -} -//////=========END CPU VERIFICATION FUNCTIONS============= diff --git a/test/cpu_rnn.hpp b/test/cpu_rnn.hpp new file mode 100644 index 0000000000..b879bd27fb --- /dev/null +++ b/test/cpu_rnn.hpp @@ -0,0 +1,4913 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +/********************************************** + * LSTM CPU verification functions + **********************************************/ + +template +void LSTMFwdCPUVerify(miopen::Handle& handle, + bool use_dropout, + const miopen::DropoutDescriptor& dropoutDesc, + const std::vector& in, + const std::vector& wei, // [ input_state_weight_trans + // hidden_state_weight0_trans input1_trans + // hidden1_trans ... output_weight; + // bidirectional reversed weights ] + std::vector& hy_host, // current/final hidden state + const std::vector& hx, // initial hidden state + std::vector& cy_host, // current/final cell state + const std::vector& cx, // initial cell state + std::vector& out_host, + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength_cpu, // Number of iterations to unroll over + int bidirection, // whether using bidirectional net + int biased, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden layers) for + // unidirection, 2 by numlayer for bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + int out_h, // 1 by hy_h related function for unidirection, 2 by hy_h + // related function for bidirection + int inputMode_cpu, + std::vector& rsvspace, + bool hx_is_null, + bool cx_is_null) +{ + int batch_n_cpu = sumvc(in_n); + + int numlayer = bidirection == 1 ? hy_d / 2 : hy_d; + int bi = bidirection == 1 ? 2 : 1; + + int in_stride = in_h; + int out_stride = out_h; + int wei_stride = bi * 4 * hy_h; + int hy_stride = bi * 6 * hy_h; + int h_stride = bi * hy_h; + int uni_stride = hy_h; + int bi_stride = hy_h * bi; + + if(inputMode_cpu == 1) + { + if(in_h != hy_h) + { + std::cout + << "Verification cannot be completed: The input tensor size must equal to the " + << "hidden state size of the network in SKIP_INPUT mode!" 
<< std::endl; + return; + } + in_h = 0; + } + + int wei_shift_bias = (in_h + hy_h + (bi * hy_h + hy_h) * (numlayer - 1)) * wei_stride; + + // initial dropoput + std::vector dropout_states_host; + std::vector dropout_reservespace_host; + std::vector dropout_hid_state; + miopenTensorDescriptor_t dropout_inputTensor{}, dropout_outputTensor{}; + if(use_dropout) + { + size_t states_size = dropoutDesc.stateSizeInBytes / sizeof(prngStates); + dropout_states_host = std::vector(states_size); + InitKernelStateEmulator(dropout_states_host, dropoutDesc); + + std::array drop_in_len = {{batch_n_cpu, hy_h * bi}}; + std::array drop_in_str = {{hy_stride, 1}}; + std::array drop_out_str = {{hy_h * bi, 1}}; + miopenCreateTensorDescriptor(&dropout_inputTensor); + miopenCreateTensorDescriptor(&dropout_outputTensor); + miopenSetTensorDescriptor( + dropout_inputTensor, miopenFloat, 2, drop_in_len.data(), drop_in_str.data()); + miopenSetTensorDescriptor( + dropout_outputTensor, miopenFloat, 2, drop_in_len.data(), drop_out_str.data()); + + size_t reserveSpaceSizeInBytes = 0; + miopenDropoutGetReserveSpaceSize(dropout_inputTensor, &reserveSpaceSizeInBytes); + size_t reserve_size = reserveSpaceSizeInBytes / sizeof(unsigned char); + dropout_reservespace_host = std::vector(reserve_size * (numlayer - 1), + static_cast(1)); + + dropout_hid_state = + std::vector((numlayer - 1) * batch_n_cpu * hy_h * bi, static_cast(0)); + } + + // forward emulator + for(int li = 0; li < numlayer; li++) + { + int hid_shift = li * batch_n_cpu * hy_stride; + int hx_shift = li * in_n.at(0) * h_stride; + + // from input + if(li == 0) + { + if(inputMode_cpu == 1) + { + for(int bs = 0; bs < batch_n_cpu; bs++) + { + for(int h = 0; h < hy_h; h++) + { + for(int gi = 0; gi < 4; gi++) + { + rsvspace.at(hid_shift + bs * hy_stride + gi * hy_h + h) += + in.at(bs * in_stride + h); + if(bidirection == 1) + { + rsvspace.at(hid_shift + bs * hy_stride + (gi + 4) * hy_h + h) += + in.at(bs * in_stride + h); + } + } + } + } + + // from bias + if(biased == 1) + { + for(int bs = 0; bs < batch_n_cpu; bs++) + { + for(int h = 0; h < wei_stride; h++) + { + rsvspace.at(hid_shift + bs * hy_stride + h) += + wei.at(wei_shift_bias + h); + } + } + } + } + else + { + RNN_mm_cpu(in.data(), + in_h, + batch_n_cpu, + in_stride, + 0, + wei.data(), + in_h, + hy_h * bi * 4, + in_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift], + hy_h * bi * 4, + batch_n_cpu, + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased == 1) + { + for(int bs = 0; bs < batch_n_cpu; bs++) + { + for(int h = 0; h < wei_stride; h++) + { + rsvspace.at(hid_shift + bs * hy_stride + h) += + wei.at(wei_shift_bias + h); + } + } + } + } + } + else + { + int wei_shift = (in_h + hy_h) * wei_stride + (li - 1) * (bi * hy_h + hy_h) * wei_stride; + int prelayer_shift = (li - 1) * batch_n_cpu * hy_stride + bi * 5 * hy_h; + if(use_dropout) + { + auto dropout_states_tmp = dropout_states_host; + size_t drop_out_offset = (static_cast(li) - 1) * batch_n_cpu * hy_h * bi; + + DropoutForwardVerify(handle, + dropoutDesc, + miopen::deref(dropout_inputTensor), + rsvspace, + miopen::deref(dropout_outputTensor), + dropout_hid_state, + dropout_reservespace_host, + dropout_states_tmp, + prelayer_shift, + drop_out_offset, + drop_out_offset); + + prelayer_shift = drop_out_offset; + } + + RNN_mm_cpu(use_dropout ? &dropout_hid_state[prelayer_shift] + : &rsvspace[prelayer_shift], + hy_h * bi, + batch_n_cpu, + use_dropout ? 
hy_h * bi : hy_stride, + 0, + &wei[wei_shift], + hy_h * bi, + hy_h * bi * 4, + bi_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift], + hy_h * bi * 4, + batch_n_cpu, + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased == 1) + { + int wei_shift_bias_temp = wei_shift_bias + li * 2 * wei_stride; + + for(int bs = 0; bs < batch_n_cpu; bs++) + { + for(int h = 0; h < wei_stride; h++) + { + rsvspace.at(hid_shift + bs * hy_stride + h) += + wei.at(wei_shift_bias_temp + h); + } + } + } + } + + // from hidden state + int bacc = 0; + int baccbi = batch_n_cpu; + for(int ti = 0; ti < seqLength_cpu; ti++) + { + baccbi -= in_n.at(seqLength_cpu - 1 - ti); + int wei_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; + + if(ti == 0) + { + if(!hx_is_null) + { + RNN_mm_cpu(&hx[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &wei[wei_shift], + hy_h, + hy_h * 4, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + bacc * hy_stride], + hy_h * 4, + in_n.at(ti), + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased == 1) + { + int wei_shift_bias_temp = wei_shift_bias + (li * 2 + 1) * wei_stride; + + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < 4 * hy_h; h++) + { + rsvspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h) += + wei.at(wei_shift_bias_temp + h); + } + } + } + + if(bidirection == 1) + { + RNN_mm_cpu(&hx[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(seqLength_cpu - 1 - ti), + uni_stride, + 0, + &wei[wei_shift + 4 * hy_h * uni_stride], + hy_h, + hy_h * 4, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + baccbi * hy_stride + 4 * hy_h], + hy_h * 4, + in_n.at(seqLength_cpu - 1 - ti), + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased == 1) + { + int wei_shift_bias_temp = wei_shift_bias + (li * 2 + 1) * wei_stride; + + for(int bs = 0; bs < in_n.at(seqLength_cpu - 1 - ti); bs++) + { + for(int h = 0; h < 4 * hy_h; h++) + { + rsvspace.at(hid_shift + baccbi * hy_stride + 4 * hy_h + + bs * hy_stride + h) += + wei.at(wei_shift_bias_temp + 4 * hy_h + h); + } + } + } + } + } + } + else + { + RNN_mm_cpu(&hy_host[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &wei[wei_shift], + hy_h, + hy_h * 4, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + bacc * hy_stride], + hy_h * 4, + in_n.at(ti), + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased == 1) + { + int wei_shift_bias_temp = wei_shift_bias + (li * 2 + 1) * wei_stride; + + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < 4 * hy_h; h++) + { + rsvspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h) += + wei.at(wei_shift_bias_temp + h); + } + } + } + + if(bidirection == 1) + { + + if(!hx_is_null && in_n.at(seqLength_cpu - 1 - ti) > in_n.at(seqLength_cpu - ti)) + { + RNN_mm_cpu( + &hx[hx_shift + hy_n * hy_h + in_n.at(seqLength_cpu - ti) * hy_h], + hy_h, + (in_n.at(seqLength_cpu - 1 - ti) - in_n.at(seqLength_cpu - ti)), + uni_stride, + 0, + &wei[wei_shift + 4 * hy_h * uni_stride], + hy_h, + hy_h * 4, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + + (baccbi + in_n.at(seqLength_cpu - ti)) * hy_stride + + 4 * hy_h], + hy_h * 4, + (in_n.at(seqLength_cpu - 1 - ti) - in_n.at(seqLength_cpu - ti)), + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased == 1) + { + int wei_shift_bias_temp = wei_shift_bias + (li * 2 + 1) * wei_stride; + + for(int bs = in_n.at(seqLength_cpu - ti); + bs < in_n.at(seqLength_cpu - 1 - ti); + bs++) + { + for(int h = 0; h < 4 * hy_h; h++) + { + rsvspace.at(hid_shift + baccbi * hy_stride + 4 * hy_h + + bs * hy_stride + h) += 
+ wei.at(wei_shift_bias_temp + 4 * hy_h + h); + } + } + } + } + + RNN_mm_cpu(&hy_host[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(seqLength_cpu - ti), + uni_stride, + 0, + &wei[wei_shift + 4 * hy_h * uni_stride], + hy_h, + hy_h * 4, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + baccbi * hy_stride + 4 * hy_h], + hy_h * 4, + in_n.at(seqLength_cpu - ti), + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased == 1) + { + int wei_shift_bias_temp = wei_shift_bias + (li * 2 + 1) * wei_stride; + + for(int bs = 0; bs < in_n.at(seqLength_cpu - ti); bs++) + { + for(int h = 0; h < 4 * hy_h; h++) + { + rsvspace.at(hid_shift + baccbi * hy_stride + 4 * hy_h + + bs * hy_stride + h) += + wei.at(wei_shift_bias_temp + 4 * hy_h + h); + } + } + } + } + } + + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + h), 2) * + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h), + 1); + if(ti == 0) + { + if(!cx_is_null) + { + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += + activfunc( + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h), + 2) * + cx.at(hx_shift + bs * uni_stride + h); + } + } + else + { + int prec_shift = li * batch_n_cpu * hy_stride + + (bacc - in_n.at(ti - 1)) * hy_stride + bi * 4 * hy_h; + + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h), + 2) * + rsvspace.at(prec_shift + bs * hy_stride + h); + } + + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h) += + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h), + 2) * + activfunc( + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h), + 1); + + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + h + + numlayer * batch_n_cpu * hy_stride) = + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + h), 2); + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h + + numlayer * batch_n_cpu * hy_stride) = + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h), 2); + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h + + numlayer * batch_n_cpu * hy_stride) = + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h), + 2); + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h + + numlayer * batch_n_cpu * hy_stride) = + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h), + 1); + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h + + numlayer * batch_n_cpu * hy_stride) = + activfunc( + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h), + 1); + + cy_host.at(hx_shift + bs * uni_stride + h) = + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h); + hy_host.at(hx_shift + bs * uni_stride + h) = + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h); + } + } + + if(bidirection == 1) + { + for(int bs = 0; bs < in_n.at(seqLength_cpu - 1 - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + hy_h + + h) += + activfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h), + 2) * + activfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h), + 1); + if(ti == 0) + { + if(!cx_is_null) + { + rsvspace.at(hid_shift + (baccbi + bs) * 
hy_stride + bi * 4 * hy_h + + hy_h + h) += + activfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + + 5 * hy_h + h), + 2) * + cx.at(hx_shift + bs * uni_stride + hy_n * hy_h + h); + } + } + else + { + + if(!cx_is_null && + in_n.at(seqLength_cpu - 1 - ti) > in_n.at(seqLength_cpu - ti)) + { + if(bs >= in_n.at(seqLength_cpu - ti)) + { + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + + bi * 4 * hy_h + hy_h + h) += + activfunc(rsvspace.at(hid_shift + + (baccbi + bs) * hy_stride + 5 * hy_h + + h), + 2) * + cx.at(hx_shift + bs * uni_stride + hy_n * hy_h + h); + } + } + + if(bs < in_n.at(seqLength_cpu - ti)) + { + int prec_shift = + li * batch_n_cpu * hy_stride + + (baccbi + in_n.at(seqLength_cpu - 1 - ti)) * hy_stride + + bi * 4 * hy_h + hy_h; + + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + + hy_h + h) += + activfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + + 5 * hy_h + h), + 2) * + rsvspace.at(prec_shift + bs * hy_stride + h); + } + } + + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 5 * hy_h + hy_h + + h) += + activfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h), + 2) * + activfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + + bi * 4 * hy_h + hy_h + h), + 1); + + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h + + numlayer * batch_n_cpu * hy_stride) = + activfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h), + 2); + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h + + numlayer * batch_n_cpu * hy_stride) = + activfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h), + 2); + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h + + numlayer * batch_n_cpu * hy_stride) = + activfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h), + 2); + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h + + numlayer * batch_n_cpu * hy_stride) = + activfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h), + 1); + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + hy_h + + h + numlayer * batch_n_cpu * hy_stride) = + activfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + + bi * 4 * hy_h + hy_h + h), + 1); + + cy_host.at(hx_shift + bs * uni_stride + hy_n * hy_h + h) = rsvspace.at( + hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + hy_h + h); + hy_host.at(hx_shift + bs * uni_stride + hy_n * hy_h + h) = rsvspace.at( + hid_shift + (baccbi + bs) * hy_stride + bi * 5 * hy_h + hy_h + h); + } + } + } + + bacc += in_n.at(ti); + } + } + + // output + int prelayer_shift = (numlayer - 1) * batch_n_cpu * hy_stride + bi * 5 * hy_h; + + for(int bs = 0; bs < batch_n_cpu; bs++) + { + for(int h = 0; h < out_h; h++) + { + out_host.at(bs * out_stride + h) = rsvspace.at(prelayer_shift + bs * hy_stride + h); + } + } + + if(use_dropout) + { + const size_t dropout_size = static_cast(numlayer - 1) * batch_n_cpu * hy_h * bi; + const size_t dropout_offset = static_cast(numlayer) * batch_n_cpu * hy_stride * 2; + if(dropout_size > 0) + { + for(size_t i = 0; i < dropout_size; i++) + { + rsvspace.at(dropout_offset + i) = dropout_hid_state.at(i); + } + + auto p_drop_rsv = + reinterpret_cast(&rsvspace.at(dropout_offset + dropout_size)); + for(size_t i = 0; i < dropout_size; i++) + { + *(p_drop_rsv + i) = dropout_reservespace_host.at(i); + } + } + } +} + +template +void LSTMBwdDataCPUVerify(bool use_dropout_cpu, + const miopen::DropoutDescriptor& dropoutDesc, + std::vector& din_host, + 
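+                           // din_host accumulates the input gradient (dx); it is
+                           // assumed to arrive zero-initialized from the caller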
const std::vector& wei, // [ input_state_weight_trans + // hidden_state_weight0_trans input1_trans + // hidden1_trans ... output_weight; + // bidirectional reversed weights ] + const std::vector& dhy_cpu, // current/final hidden state + std::vector& dhx_host, + const std::vector& hx, // initial hidden state + const std::vector& dcy_cpu, // current/final cell state + std::vector& dcx_host, + const std::vector& cx, + const std::vector& out, + const std::vector& dout, + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength_cpu, // Number of iterations to unroll over + int bidirection, // whether using bidirectional net + int, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden layers) + // for unidirection, 2 by numlayer for bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + int out_h, // 1 by hy_h related function for unidirection, 2 by + // hy_h related function for bidirection + int inputMode_cpu, + std::vector& rsvspace, + std::vector& wkspace, + bool cx_is_null, + bool dhy_is_null, + bool dcy_is_null) +{ + int batch_n_cpu = sumvc(in_n); + (void)out; + (void)hx; + + int numlayer = bidirection == 1 ? hy_d / 2 : hy_d; + int bi = bidirection == 1 ? 2 : 1; + + int in_stride = in_h; + int out_stride = out_h; + int wei_stride = bi * 4 * hy_h; + int hy_stride = bi * 6 * hy_h; + int h_stride = bi * hy_h; + int uni_stride = hy_h; + int bi_stride = hy_h * bi; + + if(inputMode_cpu == 1) + { + if(in_h != hy_h) + { + std::cout + << "Verification cannot be completed: The input tensor size must equal to the " + << "hidden state size of the network in SKIP_INPUT mode!" << std::endl; + return; + } + in_h = 0; + } + + // initial dropoput + miopenTensorDescriptor_t dropout_inputTensor{}; + std::vector dropout_reservespace_host; + if(use_dropout_cpu) + { + std::array drop_in_len = {{batch_n_cpu, hy_h * bi}}; + std::array drop_in_str = {{hy_stride, 1}}; + miopenCreateTensorDescriptor(&dropout_inputTensor); + miopenSetTensorDescriptor( + dropout_inputTensor, miopenFloat, 2, drop_in_len.data(), drop_in_str.data()); + + size_t reserveSpaceSizeInBytes = 0; + miopenDropoutGetReserveSpaceSize(dropout_inputTensor, &reserveSpaceSizeInBytes); + size_t reserve_size = reserveSpaceSizeInBytes / sizeof(unsigned char); + dropout_reservespace_host = std::vector(reserve_size * (numlayer - 1), + static_cast(0)); + + const size_t dropout_size = static_cast(numlayer - 1) * batch_n_cpu * hy_h * bi; + const size_t dropout_offset = static_cast(numlayer) * batch_n_cpu * hy_stride * 2; + if(dropout_size > 0) + { + auto p_drop_rsv = + reinterpret_cast(&rsvspace.at(dropout_offset + dropout_size)); + for(size_t i = 0; i < dropout_size; i++) + { + dropout_reservespace_host.at(i) = *(p_drop_rsv + i); + } + } + } + + // bwd data emulator + for(int li = numlayer - 1; li >= 0; li--) + { + int wei_shift = (in_h + hy_h) * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; + int hid_shift = li * batch_n_cpu * hy_stride; + int hx_shift = li * in_n.at(0) * h_stride; + + if(li == numlayer - 1) + { + for(int bs = 0; bs < batch_n_cpu; bs++) + { + for(int h = 0; h < out_h; h++) + { + wkspace.at(hid_shift + bi * 5 * hy_h + bs * hy_stride + h) += + dout.at(bs * out_stride + h); + } + } + } + else + { + int prelayer_shift = (li + 1) * batch_n_cpu * hy_stride; + + RNN_mm_cpu(&wkspace[prelayer_shift], + hy_h * bi * 4, + batch_n_cpu, + hy_stride, + 0, + &wei[wei_shift], + hy_h * bi, + hy_h * bi * 4, + bi_stride, + 0, + &wkspace[hid_shift + bi 
* 5 * hy_h], + hy_h * bi, + batch_n_cpu, + hy_stride, + 0, + 1, + 1); + + if(use_dropout_cpu) + { + DropoutBackwardVerify(dropoutDesc, + miopen::deref(dropout_inputTensor), + wkspace, + miopen::deref(dropout_inputTensor), + wkspace, + dropout_reservespace_host, + hid_shift + bi * 5 * hy_h, + hid_shift + bi * 5 * hy_h, + li * batch_n_cpu * hy_h * bi); + } + } + + // from hidden state + int bacc = batch_n_cpu; + int baccbi = 0; + for(int ti = seqLength_cpu - 1; ti >= 0; ti--) + { + bacc -= in_n.at(ti); + + if(ti == seqLength_cpu - 1) + { + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + if(!dhy_is_null) + { + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h) += + dhy_cpu.at(hx_shift + bs * uni_stride + h); + } + if(!dcy_is_null) + { + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += + dcy_cpu.at(hx_shift + bs * uni_stride + h); + } + } + } + + if(bidirection == 1) + { + for(int bs = 0; bs < in_n.at(seqLength_cpu - 1 - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + if(!dhy_is_null) + { + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 5 * hy_h + + hy_h + h) += + dhy_cpu.at(hx_shift + bs * uni_stride + hy_n * hy_h + h); + } + if(!dcy_is_null) + { + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + + hy_h + h) += + dcy_cpu.at(hx_shift + bs * uni_stride + hy_n * hy_h + h); + } + } + } + } + } + else + { + if(!dhy_is_null && in_n.at(ti) > in_n.at(ti + 1)) + { + for(int bs = in_n.at(ti + 1); bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h) += + dhy_cpu.at(hx_shift + bs * uni_stride + h); + } + } + } + + if(!dcy_is_null && in_n.at(ti) > in_n.at(ti + 1)) + { + for(int bs = in_n.at(ti + 1); bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += + dcy_cpu.at(hx_shift + bs * uni_stride + h); + } + } + } + + int pretime_shift = li * batch_n_cpu * hy_stride + (bacc + in_n.at(ti)) * hy_stride; + int weitime_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; + + RNN_mm_cpu(&wkspace[pretime_shift], + hy_h * 4, + in_n.at(ti + 1), + hy_stride, + 0, + &wei[weitime_shift], + hy_h, + hy_h * 4, + uni_stride, + 0, + &wkspace[hid_shift + bacc * hy_stride + bi * 5 * hy_h], + hy_h, + in_n.at(ti + 1), + hy_stride, + 0, + 1, + 1); + + if(bidirection == 1) + { + pretime_shift = li * batch_n_cpu * hy_stride + + (baccbi - in_n.at(seqLength_cpu - 2 - ti)) * hy_stride + + hy_h * 4; + weitime_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride + + hy_h * 4 * uni_stride; + + RNN_mm_cpu(&wkspace[pretime_shift], + hy_h * 4, + in_n.at(seqLength_cpu - 1 - ti), + hy_stride, + 0, + &wei[weitime_shift], + hy_h, + hy_h * 4, + uni_stride, + 0, + &wkspace[hid_shift + baccbi * hy_stride + bi * 5 * hy_h + hy_h], + hy_h, + in_n.at(seqLength_cpu - 1 - ti), + hy_stride, + 0, + 1, + 1); + } + } + + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + if(ti < seqLength_cpu - 1) + { + if(bs < in_n.at(ti + 1)) + { + int pretime_shift = + li * batch_n_cpu * hy_stride + (bacc + in_n.at(ti)) * hy_stride; + + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += + wkspace.at(pretime_shift + bs * hy_stride + bi * 4 * hy_h + h) * + activfunc(rsvspace.at(pretime_shift + bs * hy_stride + hy_h + h), + 2); + } + } + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) += + wkspace.at(hid_shift + 
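+                        // dc_t += dh_t * o_t * tanh'(c_t), assuming (as in the
+                        // forward emulator) sigmoid = activfunc(., 2) and
+                        // tanh = activfunc(., 1)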
(bacc + bs) * hy_stride + bi * 5 * hy_h + h) * + dervactivfunc( + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h), + 1) * + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h), + 2); + + if(ti == 0) + { + if(!cx_is_null) + { + wkspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h) += + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + + h) * + cx.at(hx_shift + bs * uni_stride + h) * + dervactivfunc( + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h), 2); + } + } + else + { + int pretime_shift = + li * batch_n_cpu * hy_stride + (bacc - in_n.at(ti - 1)) * hy_stride; + + wkspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h) += + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) * + rsvspace.at(pretime_shift + bs * hy_stride + bi * 4 * hy_h + h) * + dervactivfunc( + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + hy_h + h), 2); + } + wkspace.at(hid_shift + (bacc + bs) * hy_stride + h) += + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) * + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h), + 1) * + dervactivfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + h), 2); + wkspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h) += + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 5 * hy_h + h) * + activfunc( + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h), + 1) * + dervactivfunc( + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h), 2); + wkspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h) += + wkspace.at(hid_shift + (bacc + bs) * hy_stride + bi * 4 * hy_h + h) * + activfunc(rsvspace.at(hid_shift + (bacc + bs) * hy_stride + h), 2) * + dervactivfunc( + rsvspace.at(hid_shift + (bacc + bs) * hy_stride + 3 * hy_h + h), 1); + } + } + + if(bidirection == 1) + { + for(int bs = 0; bs < in_n.at(seqLength_cpu - 1 - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + if(ti < seqLength_cpu - 1) + { + int pretime_shift = + li * batch_n_cpu * hy_stride + + (baccbi - in_n.at(seqLength_cpu - 2 - ti)) * hy_stride; + + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + + hy_h + h) += + wkspace.at(pretime_shift + bs * hy_stride + bi * 4 * hy_h + hy_h + + h) * + activfunc( + rsvspace.at(pretime_shift + bs * hy_stride + 5 * hy_h + h), 2); + } + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + hy_h + + h) += + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 5 * hy_h + + hy_h + h) * + dervactivfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + + bi * 4 * hy_h + hy_h + h), + 1) * + activfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h), + 2); + + if(ti == 0) + { + if(!cx_is_null) + { + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h) += + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + + bi * 4 * hy_h + hy_h + h) * + cx.at(hx_shift + bs * uni_stride + hy_n * hy_h + h) * + dervactivfunc(rsvspace.at(hid_shift + + (baccbi + bs) * hy_stride + 5 * hy_h + + h), + 2); + } + } + else + { + if(!cx_is_null && + in_n.at(seqLength_cpu - 1 - ti) > in_n.at(seqLength_cpu - ti) && + bs >= in_n.at(seqLength_cpu - ti)) + { + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h) += + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + + bi * 4 * hy_h + hy_h + h) * + cx.at(hx_shift + bs * uni_stride + hy_n * hy_h + h) * + dervactivfunc(rsvspace.at(hid_shift + + (baccbi + bs) * hy_stride + 5 * hy_h + + h), + 2); + } + + if(bs < 
in_n.at(seqLength_cpu - ti)) + { + int pretime_shift = + li * batch_n_cpu * hy_stride + + (baccbi + in_n.at(seqLength_cpu - 1 - ti)) * hy_stride; + + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h) += + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + + bi * 4 * hy_h + hy_h + h) * + rsvspace.at(pretime_shift + bs * hy_stride + bi * 4 * hy_h + + hy_h + h) * + dervactivfunc(rsvspace.at(hid_shift + + (baccbi + bs) * hy_stride + 5 * hy_h + + h), + 2); + } + } + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h) += + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + + hy_h + h) * + activfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h), + 1) * + dervactivfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h), + 2); + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h) += + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 5 * hy_h + + hy_h + h) * + activfunc(rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + + bi * 4 * hy_h + hy_h + h), + 1) * + dervactivfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 6 * hy_h + h), + 2); + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h) += + wkspace.at(hid_shift + (baccbi + bs) * hy_stride + bi * 4 * hy_h + + hy_h + h) * + activfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h), + 2) * + dervactivfunc( + rsvspace.at(hid_shift + (baccbi + bs) * hy_stride + 7 * hy_h + h), + 1); + } + } + } + + baccbi += in_n.at(seqLength_cpu - 1 - ti); + } + + // dcx, dhx + int pretime_shift = li * batch_n_cpu * hy_stride; + int weitime_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; + + RNN_mm_cpu(&wkspace[pretime_shift], + hy_h * 4, + in_n.at(0), + hy_stride, + 0, + &wei[weitime_shift], + hy_h, + hy_h * 4, + uni_stride, + 0, + &dhx_host[hx_shift], + hy_h, + in_n.at(0), + uni_stride, + 0, + 1, + 1); + + for(int bs = 0; bs < in_n.at(0); bs++) + { + for(int h = 0; h < hy_h; h++) + { + dcx_host.at(hx_shift + bs * uni_stride + h) += + wkspace.at(pretime_shift + bs * hy_stride + bi * 4 * hy_h + h) * + activfunc(rsvspace.at(pretime_shift + bs * hy_stride + hy_h + h), 2); + } + } + + if(bidirection == 1) + { + int ti = seqLength_cpu - 1, cur_bat = 0, pre_bat = batch_n_cpu; + + while(ti >= 0) + { + pre_bat -= in_n.at(ti); + if(in_n.at(ti) > cur_bat) + { + pretime_shift = li * batch_n_cpu * hy_stride + (pre_bat + cur_bat) * hy_stride; + + RNN_mm_cpu(&wkspace[pretime_shift + 4 * hy_h], + hy_h * 4, + (in_n.at(ti) - cur_bat), + hy_stride, + 0, + &wei[weitime_shift + 4 * hy_h * uni_stride], + hy_h, + hy_h * 4, + uni_stride, + 0, + &dhx_host[hx_shift + hy_n * hy_h + cur_bat * hy_h], + hy_h, + (in_n.at(ti) - cur_bat), + uni_stride, + 0, + 1, + 1); + + for(int bs = cur_bat; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + dcx_host.at(hx_shift + bs * uni_stride + hy_n * hy_h + h) += + wkspace.at(pretime_shift + (bs - cur_bat) * hy_stride + + bi * 4 * hy_h + hy_h + h) * + activfunc(rsvspace.at(pretime_shift + (bs - cur_bat) * hy_stride + + 5 * hy_h + h), + 2); + } + } + } + cur_bat = in_n.at(ti--); + } + } + } + + // dinput + if(inputMode_cpu == 1) + { + for(int bs = 0; bs < batch_n_cpu; bs++) + { + for(int h = 0; h < hy_h; h++) + { + for(int gi = 0; gi < 4; gi++) + { + din_host.at(bs * in_stride + h) += wkspace.at(bs * hy_stride + gi * hy_h + h); + if(bidirection == 1) + { + din_host.at(bs * in_stride + h) += + wkspace.at(bs * hy_stride + (gi + 4) * hy_h + h); + } + } + } + } + } + else + { + 
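+        // A sketch of what the GEMM below computes, assuming wkspace holds the
+        // packed per-gate deltas and the leading in_h * wei_stride entries of
+        // wei are the (transposed) input weights:
+        //     din[batch_n x in_h] += dgates[batch_n x bi*4*hy_h] * W_in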
RNN_mm_cpu(wkspace.data(), + hy_h * bi * 4, + batch_n_cpu, + hy_stride, + 0, + wei.data(), + in_h, + hy_h * bi * 4, + in_stride, + 0, + din_host.data(), + in_h, + batch_n_cpu, + in_stride, + 0, + 1, + 1); + } +} + +template +void LSTMBwdWeightCPUVerify(bool use_dropout_cpu, + const std::vector& in, + std::vector& dwei_host, // [ input_state_weight_trans + // hidden_state_weight0_trans + // input1_trans hidden1_trans ... + // output_weight; bidirectional + // reversed weights ] + const std::vector& hx, // initial hidden state + const std::vector& dout, + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength_cpu, // Number of iterations to unroll over + int bidirection, // whether using bidirectional net + int biased, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden + // layers) for unidirection, 2 by numlayer for + // bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + int out_h, // 1 by hy_h related function for unidirection, 2 + // by hy_h related function for bidirection + int inputMode_cpu, + const std::vector& rsvspace, + const std::vector& wkspace, + bool hx_is_null) +{ + int batch_n_cpu = sumvc(in_n); + int numlayer = bidirection == 1 ? hy_d / 2 : hy_d; + int bi = bidirection == 1 ? 2 : 1; + + int in_stride = in_h; + int wei_stride = bi * 4 * hy_h; + int hy_stride = bi * 6 * hy_h; + int h_stride = bi * hy_h; + int uni_stride = hy_h; + int bi_stride = hy_h * bi; + (void)dout; + (void)out_h; + + if(inputMode_cpu == 1) + { + if(in_h != hy_h) + { + std::cout + << "Verification cannot be completed: The input tensor size must equal to the " + << "hidden state size of the network in SKIP_INPUT mode!" << std::endl; + return; + } + in_h = 0; + } + + int wei_shift_bias = (in_h + hy_h + (bi * hy_h + hy_h) * (numlayer - 1)) * wei_stride; + + // bwd weights emulator + for(int li = 0; li < numlayer; li++) + { + // between layers + if(li == 0) + { + if(inputMode_cpu != 1) + { + RNN_mm_cpu(wkspace.data(), + hy_h * bi * 4, + batch_n_cpu, + hy_stride, + RNN_MM_TRANSPOSE, + in.data(), + in_h, + batch_n_cpu, + in_stride, + 0, + dwei_host.data(), + in_h, + hy_h * bi * 4, + in_stride, + 0, + 1, + 1); + } + + if(biased == 1) + { + for(int h = 0; h < wei_stride; h++) + { + for(int w = 0; w < batch_n_cpu; w++) + { + dwei_host.at(wei_shift_bias + h) += wkspace.at(w * hy_stride + h); + } + } + } + } + else + { + int prelayer_shift = + use_dropout_cpu + ? 2 * numlayer * batch_n_cpu * hy_stride + (li - 1) * batch_n_cpu * hy_h * bi + : (li - 1) * batch_n_cpu * hy_stride + bi * hy_h * 5; + int hid_shift = li * batch_n_cpu * hy_stride; + int wei_shift = (in_h + hy_h) * wei_stride + (li - 1) * (bi * hy_h + hy_h) * wei_stride; + + RNN_mm_cpu(&wkspace[hid_shift], + hy_h * bi * 4, + batch_n_cpu, + hy_stride, + RNN_MM_TRANSPOSE, + &rsvspace[prelayer_shift], + hy_h * bi, + batch_n_cpu, + use_dropout_cpu ? 
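+                           // with inter-layer dropout the saved activations are
+                           // packed densely, so their row stride is hy_h * bi
+                           // rather than the reserve-space stride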
hy_h * bi : hy_stride, + 0, + &dwei_host[wei_shift], + hy_h * bi, + hy_h * bi * 4, + bi_stride, + 0, + 1, + 1); + + if(biased == 1) + { + wei_shift = wei_shift_bias + li * 2 * wei_stride; + + for(int h = 0; h < wei_stride; h++) + { + for(int w = 0; w < batch_n_cpu; w++) + { + dwei_host.at(wei_shift + h) += wkspace.at(hid_shift + w * hy_stride + h); + } + } + } + } + + // between time + int bacc = 0; + for(int ti = 0; ti < seqLength_cpu; ti++) + { + int hid_shift = li * batch_n_cpu * hy_stride + bacc * hy_stride; + int hx_shift = li * in_n.at(0) * h_stride; + int wei_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; + int pretime_shift; + + // between time + if(ti == 0) + { + if(!hx_is_null) + { + RNN_mm_cpu(&wkspace[hid_shift], + hy_h * 4, + in_n.at(ti), + hy_stride, + RNN_MM_TRANSPOSE, + &hx[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &dwei_host[wei_shift], + hy_h, + hy_h * 4, + uni_stride, + 0, + 1, + 1); + + if(biased == 1) + { + int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; + + for(int h = 0; h < hy_h * 4; h++) + { + for(int w = 0; w < in_n.at(ti); w++) + { + dwei_host.at(bias_shift + h) += + wkspace.at(hid_shift + w * hy_stride + h); + } + } + } + } + } + else + { + pretime_shift = li * batch_n_cpu * hy_stride + + (bacc - in_n.at(ti - 1)) * hy_stride + bi * 5 * hy_h; + + RNN_mm_cpu(&wkspace[hid_shift], + hy_h * 4, + in_n.at(ti), + hy_stride, + RNN_MM_TRANSPOSE, + &rsvspace[pretime_shift], + hy_h, + in_n.at(ti), + hy_stride, + 0, + &dwei_host[wei_shift], + hy_h, + hy_h * 4, + uni_stride, + 0, + 1, + 1); + + if(biased == 1) + { + int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; + + for(int h = 0; h < hy_h * 4; h++) + { + for(int w = 0; w < in_n.at(ti); w++) + { + dwei_host.at(bias_shift + h) += + wkspace.at(hid_shift + w * hy_stride + h); + } + } + } + } + + if(bidirection == 1) + { + if(ti == seqLength_cpu - 1) + { + if(!hx_is_null) + { + RNN_mm_cpu(&wkspace[hid_shift + 4 * hy_h], + hy_h * 4, + in_n.at(ti), + hy_stride, + RNN_MM_TRANSPOSE, + &hx[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &dwei_host[wei_shift + 4 * hy_h * uni_stride], + hy_h, + hy_h * 4, + uni_stride, + 0, + 1, + 1); + + if(biased == 1) + { + int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; + + for(int h = 0; h < hy_h * 4; h++) + { + for(int w = 0; w < in_n.at(ti); w++) + { + dwei_host.at(bias_shift + hy_h * 4 + h) += + wkspace.at(hid_shift + hy_h * 4 + w * hy_stride + h); + } + } + } + } + } + else + { + if(!hx_is_null && in_n.at(ti) > in_n.at(ti + 1)) + { + RNN_mm_cpu(&wkspace[hid_shift + 4 * hy_h + in_n.at(ti + 1) * hy_stride], + hy_h * 4, + (in_n.at(ti) - in_n.at(ti + 1)), + hy_stride, + RNN_MM_TRANSPOSE, + &hx[hx_shift + hy_n * hy_h + in_n.at(ti + 1) * hy_h], + hy_h, + (in_n.at(ti) - in_n.at(ti + 1)), + uni_stride, + 0, + &dwei_host[wei_shift + 4 * hy_h * uni_stride], + hy_h, + hy_h * 4, + uni_stride, + 0, + 1, + 1); + + if(biased == 1) + { + int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; + + for(int h = 0; h < hy_h * 4; h++) + { + for(int w = in_n.at(ti + 1); w < in_n.at(ti); w++) + { + dwei_host.at(bias_shift + hy_h * 4 + h) += + wkspace.at(hid_shift + hy_h * 4 + w * hy_stride + h); + } + } + } + } + + pretime_shift = li * batch_n_cpu * hy_stride + + (bacc + in_n.at(ti)) * hy_stride + bi * 5 * hy_h; + + RNN_mm_cpu(&wkspace[hid_shift + 4 * hy_h], + hy_h * 4, + in_n.at(ti + 1), + hy_stride, + RNN_MM_TRANSPOSE, + &rsvspace[pretime_shift + hy_h], + hy_h, + in_n.at(ti + 1), + hy_stride, 
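+                               // reverse-direction recurrent dW: the hidden outputs
+                               // h_rev are read from the bi*5*hy_h output slot
+                               // (+ hy_h for the reverse half) of the next time chunk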
+ 0, + &dwei_host[wei_shift + 4 * hy_h * uni_stride], + hy_h, + hy_h * 4, + uni_stride, + 0, + 1, + 1); + + if(biased == 1) + { + int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; + + for(int h = 0; h < hy_h * 4; h++) + { + for(int w = 0; w < in_n.at(ti + 1); w++) + { + dwei_host.at(bias_shift + hy_h * 4 + h) += + wkspace.at(hid_shift + hy_h * 4 + w * hy_stride + h); + } + } + } + } + } + + bacc += in_n.at(ti); + } + } +} +//////=========END CPU VERIFICATION FUNCTIONS============= + +/********************************************** + * RNN TANH_RELU CPU verification functions + * rnn_vanilla_common.hpp + **********************************************/ +template +void RNNFwdTrainCPUVerify(miopen::Handle& handle, + bool use_dropout, + const miopen::DropoutDescriptor& dropoutDesc, + const std::vector& in, + const std::vector& wei, // [ input_state_weight_trans + // hidden_state_weight0_trans input1_trans + // hidden1_trans ... output_weight; + // bidirectional reversed weights ] + std::vector& hy_host, // current/final hidden state + const std::vector& hx, // initial hidden state + std::vector& out_host, + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength, // Number of iterations to unroll over + int bidirection, // whether using bidirectional net + int biased, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden layers) for + // unidirection, 2 by numlayer for bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + int out_h, // 1 by hy_h related function for unidirection, 2 by hy_h + // related function for bidirection + int squash, + int inputMode, + std::vector& rsvspace, + bool hx_is_null = false) +{ + + int batch_n = sumvc(in_n); + + int numlayer = bidirection ? hy_d / 2 : hy_d; + int bi = bidirection ? 2 : 1; + + int in_stride = in_h; + int hy_stride = hy_h * bi; + int out_stride = out_h; + int uni_stride = hy_h; + int bi_stride = hy_h * bi; + + if(inputMode == 1) + { + if(in_h != hy_h) + { + std::cout + << "Verification cannot be completed: The input tensor size must equal to the " + << "hidden state size of the network in SKIP_INPUT mode!" 
<< std::endl; + return; + } + in_h = 0; + } + + int wei_shift_bias = ((in_h + hy_h) * bi + (bi * hy_h + hy_h) * bi * (numlayer - 1)) * hy_h; + + // initial dropoput + std::vector dropout_states_host; + std::vector dropout_reservespace_host; + std::vector dropout_hid_state; + miopenTensorDescriptor_t dropout_inputTensor{}, dropout_outputTensor{}; + if(use_dropout) + { + size_t states_size = dropoutDesc.stateSizeInBytes / sizeof(prngStates); + dropout_states_host = std::vector(states_size); + InitKernelStateEmulator(dropout_states_host, dropoutDesc); + + std::array drop_in_len = {{batch_n, hy_h * bi}}; + std::array drop_in_str = {{hy_stride, 1}}; + std::array drop_out_str = {{hy_h * bi, 1}}; + miopenCreateTensorDescriptor(&dropout_inputTensor); + miopenCreateTensorDescriptor(&dropout_outputTensor); + miopenSetTensorDescriptor( + dropout_inputTensor, miopenFloat, 2, drop_in_len.data(), drop_in_str.data()); + miopenSetTensorDescriptor( + dropout_outputTensor, miopenFloat, 2, drop_in_len.data(), drop_out_str.data()); + + size_t reserveSpaceSizeInBytes = 0; + miopenDropoutGetReserveSpaceSize(dropout_inputTensor, &reserveSpaceSizeInBytes); + size_t reserve_size = reserveSpaceSizeInBytes / sizeof(unsigned char); + dropout_reservespace_host = std::vector(reserve_size * (numlayer - 1), + static_cast(1)); + + dropout_hid_state = std::vector((numlayer - 1) * batch_n * hy_h * bi, static_cast(0)); + } + + // forward emulator + for(int li = 0; li < numlayer; li++) + { + int hid_shift = li * batch_n * hy_h * bi; + int hx_shift = li * bi * in_n.at(0) * hy_h; + + // from input + if(li == 0) + { + if(inputMode == 1) + { + // for(int bs = 0; bs < batch_n; bs++) + par_for(batch_n, 4, [&](int bs) { + for(int h = 0; h < hy_h; h++) + { + rsvspace.at(hid_shift + bs * hy_stride + h) += in.at(bs * in_stride + h); + if(bidirection) + { + rsvspace.at(hid_shift + bs * hy_stride + hy_h + h) += + in.at(bs * in_stride + h); + } + } + }); + + // from bias + if(biased) + { + // for(int bs = 0; bs < batch_n; bs++) + par_for(batch_n, 4, [&](int bs) { + for(int h = 0; h < hy_stride; h++) + { + rsvspace.at(hid_shift + bs * hy_stride + h) += + wei.at(wei_shift_bias + h); + } + }); + } + } + else + { + RNN_mm_cpu(in.data(), + in_h, + batch_n, + in_stride, + 0, + wei.data(), + in_h, + hy_h * bi, + in_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift], + hy_h * bi, + batch_n, + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased) + { + // for(int bs = 0; bs < batch_n; bs++) + par_for(batch_n, 4, [&](int bs) { + for(int h = 0; h < hy_stride; h++) + { + rsvspace.at(hid_shift + bs * hy_stride + h) += + wei.at(wei_shift_bias + h); + } + }); + } + } + } + else + { + int wei_shift = bi * (in_h + hy_h) * hy_h + (li - 1) * bi * (bi * hy_h + hy_h) * hy_h; + int prelayer_shift = (li - 1) * batch_n * hy_h * bi + numlayer * batch_n * hy_h * bi; + if(use_dropout) + { + auto dropout_states_tmp = dropout_states_host; + size_t drop_out_offset = (li - 1ULL) * batch_n * hy_h * bi; + + DropoutForwardVerify(handle, + dropoutDesc, + miopen::deref(dropout_inputTensor), + rsvspace, + miopen::deref(dropout_outputTensor), + dropout_hid_state, + dropout_reservespace_host, + dropout_states_tmp, + prelayer_shift, + drop_out_offset, + drop_out_offset); + + prelayer_shift = drop_out_offset; + } + + RNN_mm_cpu(use_dropout ? &dropout_hid_state[prelayer_shift] + : &rsvspace[prelayer_shift], + hy_h * bi, + batch_n, + use_dropout ? 
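+                       // the previous layer's post-activation outputs live in the
+                       // second half of the reserve space; the dropped-out copy in
+                       // dropout_hid_state is packed with dense hy_h * bi stride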
hy_h * bi : hy_stride, + 0, + &wei[wei_shift], + hy_h * bi, + hy_h * bi, + bi_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift], + hy_h * bi, + batch_n, + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased) + { + int wei_shift_bias_temp = wei_shift_bias + bi * li * 2 * hy_h; + + // for(int bs = 0; bs < batch_n; bs++) + par_for(batch_n, 4, [&](int bs) { + for(int h = 0; h < hy_stride; h++) + { + rsvspace.at(hid_shift + bs * hy_stride + h) += + wei.at(wei_shift_bias_temp + h); + } + }); + } + } + + // from hidden state + int bacc = 0; + int baccbi = batch_n; + for(int ti = 0; ti < seqLength; ti++) + { + baccbi -= in_n.at(seqLength - 1 - ti); + + int wei_shift = + li == 0 ? (in_h * hy_h * bi) + : (bi * (in_h + hy_h) * hy_h + (li - 1) * bi * (bi * hy_h + hy_h) * hy_h + + bi * hy_h * hy_stride); + + if(ti == 0) + { + if(!hx_is_null) + { + RNN_mm_cpu(&hx[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &wei[wei_shift], + hy_h, + hy_h, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + bacc * hy_stride], + hy_h, + in_n.at(ti), + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased) + { + int wei_shift_bias_temp = wei_shift_bias + bi * (li * 2 + 1) * hy_h; + + par_for(in_n.at(ti), 4, [&](int bs) { + for(int h = 0; h < hy_h; h++) + { + rsvspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h) += + wei.at(wei_shift_bias_temp + h); + } + }); + } + + if(bidirection) + { + RNN_mm_cpu(&hx[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(seqLength - 1 - ti), + uni_stride, + 0, + &wei[wei_shift + hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + baccbi * hy_stride + hy_h], + hy_h, + in_n.at(seqLength - 1 - ti), + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased) + { + int wei_shift_bias_temp = wei_shift_bias + bi * (li * 2 + 1) * hy_h; + + par_for(in_n.at(seqLength - 1 - ti), 4, [&](int bs) { + for(int h = 0; h < hy_h; h++) + { + rsvspace.at(hid_shift + baccbi * hy_stride + hy_h + + bs * hy_stride + h) += + wei.at(wei_shift_bias_temp + hy_h + h); + } + }); + } + } + } + } + else + { + RNN_mm_cpu(&hy_host[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &wei[wei_shift], + hy_h, + hy_h, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + bacc * hy_stride], + hy_h, + in_n.at(ti), + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased) + { + int wei_shift_bias_temp = wei_shift_bias + bi * (li * 2 + 1) * hy_h; + + par_for(in_n.at(ti), 4, [&](int bs) { + for(int h = 0; h < hy_h; h++) + { + rsvspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h) += + wei.at(wei_shift_bias_temp + h); + } + }); + } + + if(bidirection) + { + + if(!hx_is_null && in_n.at(seqLength - 1 - ti) > in_n.at(seqLength - ti)) + { + RNN_mm_cpu( + &hx[hx_shift + hy_n * hy_h + in_n.at(seqLength - ti) * hy_h], + hy_h, + (in_n.at(seqLength - 1 - ti) - in_n.at(seqLength - ti)), + uni_stride, + 0, + &wei[wei_shift + hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + (baccbi + in_n.at(seqLength - ti)) * hy_stride + + hy_h], + hy_h, + (in_n.at(seqLength - 1 - ti) - in_n.at(seqLength - ti)), + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased) + { + int wei_shift_bias_temp = wei_shift_bias + bi * (li * 2 + 1) * hy_h; + + for(int bs = in_n.at(seqLength - ti); bs < in_n.at(seqLength - 1 - ti); + bs++) + { + for(int h = 0; h < hy_h; h++) + { + rsvspace.at(hid_shift + baccbi * hy_stride + hy_h + + bs * hy_stride + h) += + wei.at(wei_shift_bias_temp + hy_h + h); + } + } + } + } + + RNN_mm_cpu(&hy_host[hx_shift + hy_n 
* hy_h], + hy_h, + in_n.at(seqLength - ti), + uni_stride, + 0, + &wei[wei_shift + hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + baccbi * hy_stride + hy_h], + hy_h, + in_n.at(seqLength - ti), + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased) + { + int wei_shift_bias_temp = wei_shift_bias + bi * (li * 2 + 1) * hy_h; + + par_for(in_n.at(seqLength - ti), 4, [&](int bs) { + for(int h = 0; h < hy_h; h++) + { + rsvspace.at(hid_shift + baccbi * hy_stride + hy_h + bs * hy_stride + + h) += wei.at(wei_shift_bias_temp + hy_h + h); + } + }); + } + } + } + + // for(int bs = 0; bs < in_n[ti]; bs++) + par_for(in_n.at(ti), 4, [&](int bs) { + for(int h = 0; h < hy_h; h++) + { + hy_host.at(hx_shift + bs * uni_stride + h) = + activfunc(rsvspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h), + squash); // squash_func + + rsvspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h + + numlayer * batch_n * hy_h * bi) = + activfunc(rsvspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h), + squash); // squash_func + } + }); + + if(bidirection) + { + // for(int bs = 0; bs < in_n.at(seqLength - 1 - ti); bs++) + par_for(in_n.at(seqLength - 1 - ti), 4, [&](int bs) { + for(int h = 0; h < hy_h; h++) + { + hy_host.at(hx_shift + hy_n * hy_h + bs * uni_stride + h) = activfunc( + rsvspace.at(hid_shift + baccbi * hy_stride + hy_h + bs * hy_stride + h), + squash); // squash_func + + rsvspace.at(hid_shift + baccbi * hy_stride + hy_h + bs * hy_stride + h + + numlayer * batch_n * hy_h * bi) = + activfunc(rsvspace.at(hid_shift + baccbi * hy_stride + hy_h + + bs * hy_stride + h), + squash); + } + }); + } + + bacc += in_n.at(ti); + } + } + + // output + int prelayer_shift = (numlayer - 1) * batch_n * hy_h * bi + numlayer * batch_n * hy_h * bi; + + if(use_dropout) + { + const size_t dropout_size = static_cast(numlayer - 1) * batch_n * hy_h * bi; + const size_t dropout_offset = static_cast(numlayer) * batch_n * hy_stride * 2; + if(dropout_size > 0) + { + for(size_t i = 0; i < dropout_size; i++) + { + rsvspace.at(dropout_offset + i) = dropout_hid_state.at(i); + } + auto p_drop_rsv = + reinterpret_cast(&rsvspace.at(dropout_offset + dropout_size)); + for(size_t i = 0; i < dropout_size; i++) + { + *(p_drop_rsv + i) = dropout_reservespace_host.at(i); + } + } + } + for(int bs = 0; bs < batch_n; bs++) + { + for(int h = 0; h < out_h; h++) + { + assert(!std::isnan(rsvspace.at(prelayer_shift + bs * hy_stride + h))); + assert(!std::isinf(rsvspace.at(prelayer_shift + bs * hy_stride + h))); + out_host.at(bs * out_stride + h) = rsvspace.at(prelayer_shift + bs * hy_stride + h); + // printf("out_host[%d]: %f\n", bs * out_stride + h, out_host.at(bs * out_stride + h)); + } + } +} + +template +void RNNBwdDataCPUVerify(bool use_dropout, + const miopen::DropoutDescriptor& dropoutDesc, + std::vector& din_host, + const std::vector& wei, // [ input_state_weight_trans + // hidden_state_weight0_trans input1_trans + // hidden1_trans ... 
output_weight; + // bidirectional reversed weights ] + const std::vector& dhy, // current/final hidden state + std::vector& dhx_host, + const std::vector&, // initial hidden state + const std::vector&, + const std::vector& dout, + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength, // Number of iterations to unroll over + int bidirection, // whether using bidirectional net + int, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden layers) + // for unidirection, 2 by numlayer for bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + int out_h, // 1 by hy_h related function for unidirection, 2 by + // hy_h related function for bidirection + int squash, + int inputMode, + std::vector& rsvspace, + std::vector& wkspace, + bool dhy_is_null = false) +{ + + int batch_n = sumvc(in_n); + + int numlayer = bidirection ? hy_d / 2 : hy_d; + int bi = bidirection ? 2 : 1; + + int in_stride = in_h; + int hy_stride = hy_h * bi; + int out_stride = out_h; + int uni_stride = hy_h; + int bi_stride = hy_h * bi; + + if(inputMode == 1) + { + if(in_h != hy_h) + { + std::cout + << "Verification cannot be completed: The input tensor size must equal to the " + << "hidden state size of the network in SKIP_INPUT mode!" << std::endl; + return; + } + in_h = 0; + } + + // initial dropoput + miopenTensorDescriptor_t dropout_inputTensor{}; + std::vector dropout_reservespace_host; + if(use_dropout) + { + std::array drop_in_len = {{batch_n, hy_h * bi}}; + std::array drop_in_str = {{hy_stride, 1}}; + miopenCreateTensorDescriptor(&dropout_inputTensor); + miopenSetTensorDescriptor( + dropout_inputTensor, miopenFloat, 2, drop_in_len.data(), drop_in_str.data()); + + size_t reserveSpaceSizeInBytes = 0; + miopenDropoutGetReserveSpaceSize(dropout_inputTensor, &reserveSpaceSizeInBytes); + size_t reserve_size = reserveSpaceSizeInBytes / sizeof(unsigned char); + dropout_reservespace_host = std::vector(reserve_size * (numlayer - 1), + static_cast(0)); + + const size_t dropout_size = static_cast(numlayer - 1) * batch_n * hy_h * bi; + const size_t dropout_offset = static_cast(numlayer) * batch_n * hy_stride * 2; + if(dropout_size > 0) + { + auto p_drop_rsv = + reinterpret_cast(&rsvspace.at(dropout_offset + dropout_size)); + for(size_t i = 0; i < dropout_size; i++) + { + dropout_reservespace_host.at(i) = *(p_drop_rsv + i); + } + } + } + + // bwd data emulator + for(int li = numlayer - 1; li >= 0; li--) + { + int wei_shift = bi * (in_h + hy_h) * hy_h + li * bi * (bi * hy_h + hy_h) * hy_h; + int hid_shift = li * batch_n * hy_h * bi; + int hx_shift = li * bi * in_n.at(0) * hy_h; + + if(li == numlayer - 1) + { + for(int bs = 0; bs < batch_n; bs++) + { + for(int h = 0; h < out_h; h++) + { + wkspace.at(hid_shift + bs * hy_stride + h) += dout.at(bs * out_stride + h); + } + } + } + else + { + int prelayer_shift = (li + 1) * batch_n * hy_h * bi; + + RNN_mm_cpu(&wkspace[prelayer_shift], + hy_h * bi, + batch_n, + hy_stride, + 0, + &wei[wei_shift], + hy_h * bi, + hy_h * bi, + bi_stride, + 0, + &wkspace[hid_shift], + hy_h * bi, + batch_n, + hy_stride, + 0, + 1, + 1); + + if(use_dropout) + { + DropoutBackwardVerify(dropoutDesc, + miopen::deref(dropout_inputTensor), + wkspace, + miopen::deref(dropout_inputTensor), + wkspace, + dropout_reservespace_host, + hid_shift, + hid_shift, + li * batch_n * hy_h * bi); + } + } + + int bacc = batch_n; + int baccbi = 0; + for(int ti = seqLength - 1; ti >= 0; ti--) + { + bacc -= in_n.at(ti); + + // from post 
state + if(ti == seqLength - 1) + { + if(!dhy_is_null) + { + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h) += + dhy.at(hx_shift + bs * uni_stride + h); + } + } + } + } + else + { + if(!dhy_is_null && in_n.at(ti) > in_n.at(ti + 1)) + { + for(int bs = in_n.at(ti + 1); bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h) += + dhy.at(hx_shift + bs * uni_stride + h); + } + } + } + + for(int bs = 0; bs < in_n.at(ti + 1); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h) += + dhx_host.at(hx_shift + bs * uni_stride + h); + } + } + } + + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h) *= dervactivfunc( + rsvspace.at(hid_shift + bacc * hy_stride + bs * hy_stride + h), squash); + } + } + + if(ti < seqLength - 1) + { + for(int bs = 0; bs < in_n.at(ti + 1); bs++) + { +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && (((__GNUC__ * 100) + __GNUC_MINOR__) >= 800) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif + memset(&dhx_host[hx_shift + bs * uni_stride], 0, hy_h * sizeof(T)); +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && (((__GNUC__ * 100) + __GNUC_MINOR__) >= 800) +#pragma GCC diagnostic pop +#endif + } + } + + wei_shift = li == 0 + ? (in_h * hy_stride) + : (bi * (in_h + hy_h) * hy_h + + (li - 1) * bi * (bi * hy_h + hy_h) * hy_h + bi * hy_h * hy_stride); + + RNN_mm_cpu(&wkspace[hid_shift + bacc * hy_stride], + hy_h, + in_n.at(ti), + hy_stride, + 0, + &wei[wei_shift], + hy_h, + hy_h, + uni_stride, + 0, + &dhx_host[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + 1, + 1); + + if(bidirection) + { + for(int bs = 0; bs < in_n.at(seqLength - 1 - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + // from post state + if(ti == seqLength - 1) + { + if(!dhy_is_null) + { + wkspace.at(hid_shift + baccbi * hy_stride + hy_h + bs * hy_stride + + h) += + dhy.at(hx_shift + hy_n * hy_h + bs * uni_stride + h); + } + } + else + { + wkspace.at(hid_shift + baccbi * hy_stride + hy_h + bs * hy_stride + + h) += + dhx_host.at(hx_shift + hy_n * hy_h + bs * uni_stride + h); + } + + wkspace.at(hid_shift + baccbi * hy_stride + hy_h + bs * hy_stride + h) *= + dervactivfunc(rsvspace.at(hid_shift + baccbi * hy_stride + hy_h + + bs * hy_stride + h), + squash); + } + } + + if(ti < seqLength - 1) + { + for(int bs = 0; bs < in_n.at(seqLength - 1 - ti); bs++) + { +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && (((__GNUC__ * 100) + __GNUC_MINOR__) >= 800) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif + memset(&dhx_host[hx_shift + bs * uni_stride + hy_n * hy_h], + 0, + hy_h * sizeof(T)); +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && (((__GNUC__ * 100) + __GNUC_MINOR__) >= 800) +#pragma GCC diagnostic pop +#endif + } + } + + RNN_mm_cpu(&wkspace[hid_shift + baccbi * hy_stride + hy_h], + hy_h, + in_n.at(seqLength - 1 - ti), + hy_stride, + 0, + &wei[wei_shift + hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + 0, + &dhx_host[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(seqLength - 1 - ti), + uni_stride, + 0, + 1, + 1); + } + + baccbi += in_n.at(seqLength - 1 - ti); + } + } + + // dinput + if(inputMode == 1) + { + for(int bs = 0; bs < batch_n; bs++) + { + for(int h = 0; h < hy_h; h++) + { + 
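+                // in SKIP_INPUT mode the input feeds the hidden units one-to-one,
+                // so dx is just the sum of the per-direction deltas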
din_host.at(bs * in_stride + h) += wkspace.at(bs * hy_stride + h); + if(bidirection) + { + din_host.at(bs * in_stride + h) += wkspace.at(bs * hy_stride + hy_h + h); + } + } + } + } + else + { + RNN_mm_cpu(wkspace.data(), + hy_h * bi, + batch_n, + hy_stride, + 0, + wei.data(), + in_h, + hy_h * bi, + in_stride, + 0, + din_host.data(), + in_h, + batch_n, + in_stride, + 0, + 1, + 1); + } +} + +template +void RNNBwdWeightCPUVerify(bool use_dropout, + const std::vector& in, + std::vector& dwei_host, // [ input_state_weight_trans + // hidden_state_weight0_trans + // input1_trans hidden1_trans ... + // output_weight; bidirectional + // reversed weights ] + const std::vector& hx, // initial hidden state + const std::vector& dout, + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength, // Number of iterations to unroll over + bool bidirection, // whether using bidirectional net + bool biased, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden + // layers) for unidirection, 2 by numlayer for + // bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + int out_h, // 1 by hy_h related function for unidirection, 2 + // by hy_h related function for bidirection + int squash, + int inputMode, + const std::vector& rsvspace, + const std::vector& wkspace, + bool hx_is_null = false) +{ + + int batch_n = sumvc(in_n); + int numlayer = bidirection ? hy_d / 2 : hy_d; + int bi = bidirection ? 2 : 1; + + int in_stride = in_h; + int hy_stride = hy_h * bi; + int uni_stride = hy_h; + int bi_stride = hy_h * bi; + + (void)hy_n; + (void)out_h; + (void)dout; + (void)squash; + + if(inputMode == 1) + { + if(in_h != hy_h) + { + std::cout + << "Verification cannot be completed: The input tensor size must equal to the " + << "hidden state size of the network in SKIP_INPUT mode!" << std::endl; + return; + } + in_h = 0; + } + + int wei_len = (bi * (in_h + hy_h) + (numlayer - 1) * bi * (bi + 1) * hy_h) * hy_h; + int wei_shift_bias = wei_len; + + // bwd weights emulator + for(int li = 0; li < numlayer; li++) + { + // between layers + if(li == 0) + { + if(inputMode != 1) + { + RNN_mm_cpu(wkspace.data(), + hy_h * bi, + batch_n, + hy_stride, + RNN_MM_TRANSPOSE, + in.data(), + in_h, + batch_n, + in_stride, + 0, + dwei_host.data(), + in_h, + hy_h * bi, + in_stride, + 0, + 1, + 1); + } + + if(biased) + { + for(int h = 0; h < hy_stride; h++) + { + for(int w = 0; w < batch_n; w++) + { + dwei_host.at(wei_shift_bias + h) += wkspace.at(w * hy_stride + h); + } + } + } + } + else + { + int prelayer_shift = + use_dropout ? 
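+                // assumed reserve-space layout: pre-activations in the first half,
+                // activated outputs in the second, dropout copies appended after
+                // both halves (hence the 2 * numlayer * batch_n * hy_stride offset)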
2 * numlayer * batch_n * hy_stride + (li - 1) * batch_n * hy_h * bi + : (li - 1) * bi * batch_n * hy_h + numlayer * batch_n * hy_h * bi; + int hid_shift = li * bi * batch_n * hy_h; + int wei_shift = bi * (in_h + hy_h) * hy_h + (li - 1) * bi * (bi * hy_h + hy_h) * hy_h; + + RNN_mm_cpu(&wkspace[hid_shift], + hy_h * bi, + batch_n, + hy_stride, + RNN_MM_TRANSPOSE, + &rsvspace[prelayer_shift], + hy_h * bi, + batch_n, + hy_stride, + 0, + &dwei_host[wei_shift], + hy_h * bi, + hy_h * bi, + bi_stride, + 0, + 1, + 1); + + if(biased) + { + wei_shift = wei_shift_bias + li * bi * 2 * hy_h; + + for(int h = 0; h < hy_stride; h++) + { + for(int w = 0; w < batch_n; w++) + { + dwei_host.at(wei_shift + h) += wkspace.at(hid_shift + w * hy_stride + h); + } + } + } + } + + int bacc = 0; + for(int ti = 0; ti < seqLength; ti++) + { + int hid_shift = li * bi * batch_n * hy_h + bacc * hy_stride; + int hx_shift = li * bi * in_n.at(0) * hy_h; + int wei_shift; + int pretime_shift; + + wei_shift = li == 0 + ? (in_h * hy_stride) + : (bi * (in_h + hy_h) * hy_h + + (li - 1) * bi * (bi * hy_h + hy_h) * hy_h + bi * hy_h * hy_stride); + + // between time + if(ti == 0) + { + if(!hx_is_null) + { + RNN_mm_cpu(&wkspace[hid_shift], + hy_h, + in_n.at(ti), + hy_stride, + RNN_MM_TRANSPOSE, + &hx[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &dwei_host[wei_shift], + hy_h, + hy_h, + uni_stride, + 0, + 1, + 1); + + if(biased) + { + int bias_shift = wei_shift_bias + li * bi * 2 * hy_h + bi * hy_h; + + for(int h = 0; h < hy_h; h++) + { + for(int w = 0; w < in_n.at(ti); w++) + { + dwei_host.at(bias_shift + h) += + wkspace.at(hid_shift + w * hy_stride + h); + } + } + } + } + } + else + { + pretime_shift = li * bi * batch_n * hy_h + (bacc - in_n.at(ti - 1)) * hy_stride + + numlayer * batch_n * hy_h * bi; + + RNN_mm_cpu(&wkspace[hid_shift], + hy_h, + in_n.at(ti), + hy_stride, + RNN_MM_TRANSPOSE, + &rsvspace[pretime_shift], + hy_h, + in_n.at(ti), + hy_stride, + 0, + &dwei_host[wei_shift], + hy_h, + hy_h, + uni_stride, + 0, + 1, + 1); + + if(biased) + { + int bias_shift = wei_shift_bias + li * bi * 2 * hy_h + bi * hy_h; + + for(int h = 0; h < hy_h; h++) + { + for(int w = 0; w < in_n.at(ti); w++) + { + dwei_host.at(bias_shift + h) += + wkspace.at(hid_shift + w * hy_stride + h); + } + } + } + } + + if(bidirection) + { + if(ti == seqLength - 1) + { + if(!hx_is_null) + { + RNN_mm_cpu(&wkspace[hid_shift + hy_h], + hy_h, + in_n.at(ti), + hy_stride, + RNN_MM_TRANSPOSE, + &hx[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &dwei_host[wei_shift + hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + 0, + 1, + 1); + + if(biased) + { + int bias_shift = wei_shift_bias + li * bi * 2 * hy_h + bi * hy_h; + + for(int h = 0; h < hy_h; h++) + { + for(int w = 0; w < in_n.at(ti); w++) + { + dwei_host.at(bias_shift + hy_h + h) += + wkspace.at(hid_shift + w * hy_stride + hy_h + h); + } + } + } + } + } + else + { + if(!hx_is_null && in_n.at(ti) > in_n.at(ti + 1)) + { + RNN_mm_cpu(&wkspace[hid_shift + hy_h + in_n.at(ti + 1) * hy_stride], + hy_h, + (in_n.at(ti) - in_n.at(ti + 1)), + hy_stride, + RNN_MM_TRANSPOSE, + &hx[hx_shift + hy_n * hy_h + in_n.at(ti + 1) * hy_h], + hy_h, + (in_n.at(ti) - in_n.at(ti + 1)), + uni_stride, + 0, + &dwei_host[wei_shift + hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + 0, + 1, + 1); + + if(biased) + { + int bias_shift = wei_shift_bias + li * bi * 2 * hy_h + bi * hy_h; + + for(int h = 0; h < hy_h; h++) + { + for(int w = in_n.at(ti + 1); w < in_n.at(ti); w++) + { + dwei_host.at(bias_shift + hy_h + h) += + 
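+                                    // db(rev) is the column sum of the reverse
+                                    // deltas; only the batches past in_n.at(ti + 1)
+                                    // contribute here, matching the hx-backed GEMM above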
wkspace.at(hid_shift + w * hy_stride + hy_h + h); + } + } + } + } + + pretime_shift = li * bi * batch_n * hy_h + (bacc + in_n.at(ti)) * hy_stride + + numlayer * batch_n * hy_h * bi; + + RNN_mm_cpu(&wkspace[hid_shift + hy_h], + hy_h, + in_n.at(ti + 1), + hy_stride, + RNN_MM_TRANSPOSE, + &rsvspace[pretime_shift + hy_h], + hy_h, + in_n.at(ti + 1), + hy_stride, + 0, + &dwei_host[wei_shift + hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + 0, + 1, + 1); + + if(biased) + { + int bias_shift = wei_shift_bias + li * bi * 2 * hy_h + bi * hy_h; + + for(int h = 0; h < hy_h; h++) + { + for(int w = 0; w < in_n.at(ti + 1); w++) + { + dwei_host.at(bias_shift + hy_h + h) += + wkspace.at(hid_shift + w * hy_stride + hy_h + h); + } + } + } + } + } + + bacc += in_n.at(ti); + } + } +} + +//////=========END RNN TANH_RELU CPU VERIFICATION FUNCTIONS============= + +/********************************************** + * GRU CPU verification functions + * + **********************************************/ + +template +void GRUFwdCPUVerify(miopen::Handle& handle, + bool use_dropout, + const miopen::DropoutDescriptor& dropoutDesc, + const std::vector& in, + const std::vector& wei, // [ input_state_weight_trans + // hidden_state_weight0_trans input1_trans + // hidden1_trans ... output_weight; + // bidirectional reversed weights ] + std::vector& hy, // current/final hidden state + const std::vector& hx, // initial hidden state + std::vector& out, + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength, // Number of iterations to unroll over + bool bidirection, // whether using bidirectional net + bool biased, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden layers) for + // unidirection, 2 by numlayer for bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + int out_h, // 1 by hy_h related function for unidirection, 2 by hy_h + // related function for bidirection + int inputMode, + std::vector& rsvspace, + bool hx_is_null = false) +{ + int batch_n = sumvc(in_n); + + int numlayer = bidirection ? hy_d / 2 : hy_d; + int bi = bidirection ? 2 : 1; + + int in_stride = in_h; + int out_stride = out_h; + int wei_stride = bi * 3 * hy_h; + int hy_stride = bi * 4 * hy_h; + int h_stride = bi * hy_h; + int uni_stride = hy_h; + int bi_stride = hy_h * bi; + + if(inputMode == 1) + { + if(in_h != hy_h) + { + std::cout + << "Verification cannot be completed: The input tensor size must equal to the " + << "hidden state size of the network in SKIP_INPUT mode!" 
<< std::endl; + return; + } + in_h = 0; + } + + int wei_shift_bias = (in_h + hy_h + (bi * hy_h + hy_h) * (numlayer - 1)) * wei_stride; + + // initial dropoput + std::vector dropout_states_host; + std::vector dropout_reservespace_host; + std::vector dropout_hid_state; + miopenTensorDescriptor_t dropout_inputTensor{}, dropout_outputTensor{}; + if(use_dropout) + { + size_t states_size = dropoutDesc.stateSizeInBytes / sizeof(prngStates); + dropout_states_host = std::vector(states_size); + InitKernelStateEmulator(dropout_states_host, dropoutDesc); + + std::array drop_in_len = {{batch_n, hy_h * bi}}; + std::array drop_in_str = {{hy_stride, 1}}; + std::array drop_out_str = {{hy_h * bi, 1}}; + miopenCreateTensorDescriptor(&dropout_inputTensor); + miopenCreateTensorDescriptor(&dropout_outputTensor); + miopenSetTensorDescriptor( + dropout_inputTensor, miopenFloat, 2, drop_in_len.data(), drop_in_str.data()); + miopenSetTensorDescriptor( + dropout_outputTensor, miopenFloat, 2, drop_in_len.data(), drop_out_str.data()); + + size_t reserveSpaceSizeInBytes = 0; + miopenDropoutGetReserveSpaceSize(dropout_inputTensor, &reserveSpaceSizeInBytes); + size_t reserve_size = reserveSpaceSizeInBytes / sizeof(unsigned char); + dropout_reservespace_host = std::vector(reserve_size * (numlayer - 1), + static_cast(1)); + + dropout_hid_state = std::vector((numlayer - 1) * batch_n * hy_h * bi, static_cast(0)); + } + + // forward emulator + for(int li = 0; li < numlayer; li++) + { + int hid_shift = li * batch_n * hy_stride; + int hx_shift = li * in_n.at(0) * h_stride; + int wei_shift_bias_temp = wei_shift_bias + li * 2 * wei_stride; + + // from input + if(li == 0) + { + if(inputMode == 1) + { + for(int bs = 0; bs < batch_n; bs++) + { + for(int h = 0; h < hy_h; h++) + { + for(int gi = 0; gi < 3; gi++) + { + rsvspace[hid_shift + bs * hy_stride + gi * hy_h + h] += + in[bs * in_stride + h]; + if(bidirection) + { + rsvspace[hid_shift + bs * hy_stride + (gi + 3) * hy_h + h] += + in[bs * in_stride + h]; + } + } + } + } + + // from bias + if(biased) + { + for(int bs = 0; bs < batch_n; bs++) + { + for(int h = 0; h < wei_stride; h++) + { + rsvspace[hid_shift + bs * hy_stride + h] += wei[wei_shift_bias + h]; + } + } + } + } + else + { + RNN_mm_cpu(in.data(), + in_h, + batch_n, + in_stride, + 0, + wei.data(), // wei_state.data(), + in_h, + hy_h * bi * 3, + in_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift], + hy_h * bi * 3, + batch_n, + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased) + { + for(int bs = 0; bs < batch_n; bs++) + { + for(int h = 0; h < wei_stride; h++) + { + rsvspace[hid_shift + bs * hy_stride + h] += wei[wei_shift_bias + h]; + } + } + } + } + } + else + { + int wei_shift = (in_h + hy_h) * wei_stride + (li - 1) * (bi * hy_h + hy_h) * wei_stride; + int prelayer_shift = (li - 1) * batch_n * hy_stride + bi * 3 * hy_h; + if(use_dropout) + { + auto dropout_states_tmp = dropout_states_host; + size_t drop_out_offset = (static_cast(li) - 1) * batch_n * hy_h * bi; + + DropoutForwardVerify(handle, + dropoutDesc, + miopen::deref(dropout_inputTensor), + rsvspace, + miopen::deref(dropout_outputTensor), + dropout_hid_state, + dropout_reservespace_host, + dropout_states_tmp, + prelayer_shift, + drop_out_offset, + drop_out_offset); + + prelayer_shift = drop_out_offset; + } + + RNN_mm_cpu(use_dropout ? &dropout_hid_state[prelayer_shift] : &rsvspace[prelayer_shift], + hy_h * bi, + batch_n, + use_dropout ? 
hy_h * bi : hy_stride, + 0, + &wei[wei_shift], //&wei_state[wei_shift], + hy_h * bi, + hy_h * bi * 3, + bi_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift], + hy_h * bi * 3, + batch_n, + hy_stride, + 0, + 1, + 1); + + // from bias + if(biased) + { + for(int bs = 0; bs < batch_n; bs++) + { + for(int h = 0; h < wei_stride; h++) + { + rsvspace[hid_shift + bs * hy_stride + h] += wei[wei_shift_bias_temp + h]; + } + } + } + } + + // from hidden state + int bacc = 0; + int baccbi = batch_n; + for(int ti = 0; ti < seqLength; ti++) + { + baccbi -= in_n.at(seqLength - 1 - ti); + int wei_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; + int pretime_shift; + + if(ti == 0) + { + if(!hx_is_null) + { + RNN_mm_cpu(&hx[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &wei[wei_shift], + hy_h, + hy_h * 2, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + bacc * hy_stride], + hy_h * 2, + in_n.at(ti), + hy_stride, + 0, + 1, + 1); + + if(biased) + { + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + for(int gi = 0; gi < 2; gi++) + { + rsvspace[hid_shift + (bacc + bs) * hy_stride + gi * hy_h + h] += + wei[wei_shift_bias_temp + wei_stride + gi * hy_h + h]; + } + } + } + } + + RNN_mm_cpu(&hx[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &wei[wei_shift + 2 * hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + bacc * hy_stride + bi * 3 * hy_h], + hy_h, + in_n.at(ti), + hy_stride, + 0, + 1, + 1); + + if(biased) + { + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] += + wei[wei_shift_bias_temp + wei_stride + 2 * hy_h + h]; + } + } + } + + if(bidirection) + { + RNN_mm_cpu(&hx[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(seqLength - 1 - ti), + uni_stride, + 0, + &wei[wei_shift + 3 * hy_h * uni_stride], + hy_h, + hy_h * 2, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + baccbi * hy_stride + 3 * hy_h], + hy_h * 2, + in_n.at(seqLength - 1 - ti), + hy_stride, + 0, + 1, + 1); + + if(biased) + { + for(int bs = 0; bs < in_n.at(seqLength - 1 - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + for(int gi = 0; gi < 2; gi++) + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + + (3 + gi) * hy_h + h] += + wei[wei_shift_bias_temp + wei_stride + (3 + gi) * hy_h + + h]; + } + } + } + } + + RNN_mm_cpu(&hx[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(seqLength - 1 - ti), + uni_stride, + 0, + &wei[wei_shift + 5 * hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + baccbi * hy_stride + bi * 3 * hy_h + hy_h], + hy_h, + in_n.at(seqLength - 1 - ti), + hy_stride, + 0, + 1, + 1); + + if(biased) + { + for(int bs = 0; bs < in_n.at(seqLength - 1 - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] += + wei[wei_shift_bias_temp + wei_stride + 5 * hy_h + h]; + } + } + } + } + } + } + else + { + RNN_mm_cpu(&hy[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &wei[wei_shift], + hy_h, + hy_h * 2, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + bacc * hy_stride], + hy_h * 2, + in_n.at(ti), + hy_stride, + 0, + 1, + 1); + + if(biased) + { + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + for(int gi = 0; gi < 2; gi++) + { + rsvspace[hid_shift + (bacc + bs) * hy_stride + gi * hy_h + h] += + wei[wei_shift_bias_temp + wei_stride + gi * hy_h + h]; + } + } + } + } + + 
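+                    // the candidate's hidden contribution W_hc * h_{t-1} goes into
+                    // its own slot (offset bi * 3 * hy_h) so the update step can
+                    // apply the reset gate before the tanh, per the cuDNN-style GRU
+                    // formulation this emulator assumes:
+                    //     c_t = tanh(x_c + r_t .* (W_hc * h_{t-1} + b_hc))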
RNN_mm_cpu(&hy[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &wei[wei_shift + 2 * hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + bacc * hy_stride + bi * 3 * hy_h], + hy_h, + in_n.at(ti), + hy_stride, + 0, + 1, + 1); + + if(biased) + { + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] += + wei[wei_shift_bias_temp + wei_stride + 2 * hy_h + h]; + } + } + } + + if(bidirection) + { + + if(!hx_is_null && in_n.at(seqLength - 1 - ti) > in_n.at(seqLength - ti)) + { + RNN_mm_cpu( + &hx[hx_shift + hy_n * hy_h + in_n.at(seqLength - ti) * hy_h], + hy_h, + (in_n.at(seqLength - 1 - ti) - in_n.at(seqLength - ti)), + uni_stride, + 0, + &wei[wei_shift + 3 * hy_h * uni_stride], + hy_h, + hy_h * 2, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + (baccbi + in_n.at(seqLength - ti)) * hy_stride + + 3 * hy_h], + hy_h * 2, + (in_n.at(seqLength - 1 - ti) - in_n.at(seqLength - ti)), + hy_stride, + 0, + 1, + 1); + + if(biased) + { + for(int bs = in_n.at(seqLength - ti); bs < in_n.at(seqLength - 1 - ti); + bs++) + { + for(int h = 0; h < hy_h; h++) + { + for(int gi = 0; gi < 2; gi++) + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + + (3 + gi) * hy_h + h] += + wei[wei_shift_bias_temp + wei_stride + (3 + gi) * hy_h + + h]; + } + } + } + } + + RNN_mm_cpu( + &hx[hx_shift + hy_n * hy_h + in_n.at(seqLength - ti) * hy_h], + hy_h, + (in_n.at(seqLength - 1 - ti) - in_n.at(seqLength - ti)), + uni_stride, + 0, + &wei[wei_shift + 5 * hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + (baccbi + in_n.at(seqLength - ti)) * hy_stride + + bi * 3 * hy_h + hy_h], + hy_h, + (in_n.at(seqLength - 1 - ti) - in_n.at(seqLength - ti)), + hy_stride, + 0, + 1, + 1); + + if(biased) + { + for(int bs = in_n.at(seqLength - ti); bs < in_n.at(seqLength - 1 - ti); + bs++) + { + for(int h = 0; h < hy_h; h++) + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] += + wei[wei_shift_bias_temp + wei_stride + 5 * hy_h + h]; + } + } + } + } + + RNN_mm_cpu(&hy[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(seqLength - ti), + uni_stride, + 0, + &wei[wei_shift + 3 * hy_h * uni_stride], + hy_h, + hy_h * 2, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + baccbi * hy_stride + 3 * hy_h], + hy_h * 2, + in_n.at(seqLength - ti), + hy_stride, + 0, + 1, + 1); + + if(biased) + { + for(int bs = 0; bs < in_n.at(seqLength - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + for(int gi = 0; gi < 2; gi++) + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + + (3 + gi) * hy_h + h] += + wei[wei_shift_bias_temp + wei_stride + (3 + gi) * hy_h + h]; + } + } + } + } + + RNN_mm_cpu(&hy[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(seqLength - ti), + uni_stride, + 0, + &wei[wei_shift + 5 * hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + RNN_MM_TRANSPOSE, + &rsvspace[hid_shift + baccbi * hy_stride + bi * 3 * hy_h + hy_h], + hy_h, + in_n.at(seqLength - ti), + hy_stride, + 0, + 1, + 1); + + if(biased) + { + for(int bs = 0; bs < in_n.at(seqLength - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] += + wei[wei_shift_bias_temp + wei_stride + 5 * hy_h + h]; + } + } + } + } + } + + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h + + numlayer * batch_n * hy_stride] = 
+ rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h]; + + rsvspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h] += + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + hy_h + h], 2) * + rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h]; + rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] = 0; + + if(ti == 0) + { + if(!hx_is_null) + { + rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] += + ((1 - + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + h], 2)) * + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + + 2 * hy_h + h], + 1) + + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + h], 2) * + hx[hx_shift + bs * uni_stride + h]); + } + else + { + rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] += + ((1 - + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + h], 2)) * + activfunc( + rsvspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h], + 1)); + } + } + else + { + + pretime_shift = li * batch_n * hy_stride + + (bacc - in_n.at(ti - 1)) * hy_stride + bi * 3 * hy_h; + + rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] += + ((1 - activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + h], 2)) * + activfunc( + rsvspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h], + 1) + + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + h], 2) * + rsvspace[pretime_shift + bs * hy_stride + h]); + } + + rsvspace[hid_shift + (bacc + bs) * hy_stride + h + + numlayer * batch_n * hy_stride] = + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + h], 2); + rsvspace[hid_shift + (bacc + bs) * hy_stride + hy_h + h + + numlayer * batch_n * hy_stride] = + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + hy_h + h], 2); + rsvspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h + + numlayer * batch_n * hy_stride] = + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h], 1); + + // Update final state + hy[hx_shift + bs * uni_stride + h] = + rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h]; + } + } + + if(bidirection) + { + pretime_shift = li * batch_n * hy_stride + + (baccbi + in_n.at(seqLength - 1 - ti)) * hy_stride + bi * 3 * hy_h + + hy_h; + + for(int bs = 0; bs < in_n.at(seqLength - 1 - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + hy_h + h + + numlayer * batch_n * hy_stride] = + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + hy_h + + h]; + + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h] += + activfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h], 2) * + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + hy_h + + h]; + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + hy_h + h] = + 0; + + if(ti == 0) + { + if(!hx_is_null) + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] += + ((1 - activfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 3 * hy_h + h], + 2)) * + activfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 5 * hy_h + h], + 1) + + activfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 3 * hy_h + h], + 2) * + hx[hx_shift + bs * uni_stride + hy_n * hy_h + h]); + } + else + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] += + ((1 - activfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 3 * hy_h + h], + 2)) * + activfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 
5 * hy_h + h], + 1)); + } + } + else + { + if(!hx_is_null && in_n.at(seqLength - 1 - ti) > in_n.at(seqLength - ti)) + { + if(bs >= in_n.at(seqLength - ti)) + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] += + (activfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 3 * hy_h + h], + 2) * + hx[hx_shift + bs * uni_stride + hy_n * hy_h + h]); + } + } + + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + hy_h + + h] += + ((1 - activfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 3 * hy_h + h], + 2)) * + activfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h], + 1)); + + if(bs < in_n.at(seqLength - ti)) + { + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] += + (activfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 3 * hy_h + h], + 2) * + rsvspace[pretime_shift + bs * hy_stride + h]); + } + } + + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 3 * hy_h + h + + numlayer * batch_n * hy_stride] = + activfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 3 * hy_h + h], 2); + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h + + numlayer * batch_n * hy_stride] = + activfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h], 2); + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h + + numlayer * batch_n * hy_stride] = + activfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h], 1); + + // Update final hidden state + hy[hx_shift + bs * uni_stride + hy_n * hy_h + h] = + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + hy_h + + h]; + } + } + } + + bacc += in_n.at(ti); + } + } + + // output + int prelayer_shift = (numlayer - 1) * batch_n * hy_stride + bi * 3 * hy_h; + for(int bs = 0; bs < batch_n; bs++) + { + for(int h = 0; h < out_h; h++) + { + out[bs * out_stride + h] = rsvspace[prelayer_shift + bs * hy_stride + h]; + } + } + + if(use_dropout) + { + const size_t dropout_size = static_cast(numlayer - 1) * batch_n * hy_h * bi; + const size_t dropout_offset = static_cast(numlayer) * batch_n * hy_stride * 2; + if(dropout_size > 0) + { + for(size_t i = 0; i < dropout_size; i++) + { + rsvspace.at(dropout_offset + i) = dropout_hid_state.at(i); + } + + auto p_drop_rsv = + reinterpret_cast(&rsvspace.at(dropout_offset + dropout_size)); + for(size_t i = 0; i < dropout_size; i++) + { + *(p_drop_rsv + i) = dropout_reservespace_host.at(i); + } + } + } +} + +template +void GRUBwdDataCPUVerify(bool use_dropout, + const miopen::DropoutDescriptor& dropoutDesc, + std::vector& din, + const std::vector& wei, // [ input_state_weight_trans + // hidden_state_weight0_trans input1_trans + // hidden1_trans ... 
output_weight;
+                           // bidirectional reversed weights ]
+                        const std::vector& dhy, // current/final hidden state
+                        std::vector& dhx,
+                        const std::vector& hx, // initial hidden state
+                        const std::vector&,
+                        const std::vector& dout,
+                        const std::vector& in_n, // input batch size
+                        int in_h,      // input data length
+                        int seqLength, // Number of iterations to unroll over
+                        bool bidirection, // whether using bidirectional net
+                        bool,             // whether using bias
+                        int hy_d, // 1 by numlayer (number of stacks of hidden layers)
+                                  // for unidirection, 2 by numlayer for bidirection
+                        int hy_n, // equal to input batch size in_n[0]
+                        int hy_h, // hidden state number
+                        int out_h, // 1 by hy_h related function for unidirection, 2 by
+                                   // hy_h related function for bidirection
+                        int inputMode,
+                        std::vector& rsvspace,
+                        std::vector& wkspace,
+                        bool hx_is_null  = false,
+                        bool dhy_is_null = false)
+{
+    int batch_n = sumvc(in_n);
+
+    int numlayer = bidirection ? hy_d / 2 : hy_d;
+    int bi       = bidirection ? 2 : 1;
+
+    int in_stride  = in_h;
+    int out_stride = out_h;
+    int wei_stride = bi * 3 * hy_h;
+    int hy_stride  = bi * 4 * hy_h;
+    int h_stride   = bi * hy_h;
+    int uni_stride = hy_h;
+    int bi_stride  = hy_h * bi;
+
+    // initial hidden states
+    auto ihs = hy_d * hy_n * hy_h;
+    std::vector dcx(ihs);
+
+    if(inputMode == 1)
+    {
+        if(in_h != hy_h)
+        {
+            std::cout
+                << "Verification cannot be completed: The input tensor size must be equal to the "
+                << "hidden state size of the network in SKIP_INPUT mode!" << std::endl;
+            return;
+        }
+        in_h = 0;
+    }
+
+    // initial dropout
+    miopenTensorDescriptor_t dropout_inputTensor{};
+    std::vector dropout_reservespace_host;
+    if(use_dropout)
+    {
+        std::array drop_in_len = {{batch_n, hy_h * bi}};
+        std::array drop_in_str = {{hy_stride, 1}};
+        miopenCreateTensorDescriptor(&dropout_inputTensor);
+        miopenSetTensorDescriptor(
+            dropout_inputTensor, miopenFloat, 2, drop_in_len.data(), drop_in_str.data());
+
+        size_t reserveSpaceSizeInBytes = 0;
+        miopenDropoutGetReserveSpaceSize(dropout_inputTensor, &reserveSpaceSizeInBytes);
+        size_t reserve_size       = reserveSpaceSizeInBytes / sizeof(unsigned char);
+        dropout_reservespace_host = std::vector(reserve_size * (numlayer - 1),
+                                                static_cast(0));
+
+        const size_t dropout_size   = static_cast(numlayer - 1) * batch_n * hy_h * bi;
+        const size_t dropout_offset = static_cast(numlayer) * batch_n * hy_stride * 2;
+        if(dropout_size > 0)
+        {
+            auto p_drop_rsv =
+                reinterpret_cast(&rsvspace.at(dropout_offset + dropout_size));
+            for(size_t i = 0; i < dropout_size; i++)
+            {
+                dropout_reservespace_host.at(i) = *(p_drop_rsv + i);
+            }
+        }
+    }
+
+    // bwd data emulator
+    for(int li = numlayer - 1; li >= 0; li--)
+    {
+        int wei_shift     = (in_h + hy_h) * wei_stride + li * (bi * hy_h + hy_h) * wei_stride;
+        int hid_shift     = li * batch_n * hy_stride;
+        int hx_shift      = li * in_n.at(0) * h_stride;
+        int weitime_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride;
+
+        if(li == numlayer - 1)
+        {
+            for(int bs = 0; bs < batch_n; bs++)
+            {
+                for(int h = 0; h < out_h; h++)
+                {
+                    wkspace[hid_shift + bi * 3 * hy_h + bs * hy_stride + h] +=
+                        dout[bs * out_stride + h];
+                }
+            }
+        }
+        else
+        {
+            int prelayer_shift = (li + 1) * batch_n * hy_stride;
+
+            RNN_mm_cpu(&wkspace[prelayer_shift],
+                       hy_h * bi * 3,
+                       batch_n,
+                       hy_stride,
+                       0,
+                       &wei[wei_shift],
+                       hy_h * bi,
+                       hy_h * bi * 3,
+                       bi_stride,
+                       0,
+                       &wkspace[hid_shift + bi * 3 * hy_h],
+                       hy_h * bi,
+                       batch_n,
+                       hy_stride,
+                       0,
+                       1,
+                       1);
+
+            if(use_dropout)
+            {
+                DropoutBackwardVerify(dropoutDesc,
+                                      miopen::deref(dropout_inputTensor),
wkspace, + miopen::deref(dropout_inputTensor), + wkspace, + dropout_reservespace_host, + hid_shift + bi * 3 * hy_h, + hid_shift + bi * 3 * hy_h, + li * batch_n * hy_h * bi); + } + } + + // from hidden state + int bacc = batch_n; + int baccbi = 0; + for(int ti = seqLength - 1; ti >= 0; ti--) + { + bacc -= in_n.at(ti); + + if(ti == seqLength - 1) + { + if(!dhy_is_null) + { + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] += + dhy[hx_shift + bs * uni_stride + h]; + } + } + + if(bidirection) + { + for(int bs = 0; bs < in_n.at(seqLength - 1 - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] += + dhy[hx_shift + bs * uni_stride + hy_n * hy_h + h]; + } + } + } + } + } + else + { + if(!dhy_is_null && in_n.at(ti) > in_n.at(ti + 1)) + { + for(int bs = in_n.at(ti + 1); bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] += + dhy[hx_shift + bs * uni_stride + h]; + } + } + } + + int pretime_shift = li * batch_n * hy_stride + (bacc + in_n.at(ti)) * hy_stride; + + RNN_mm_cpu(&wkspace[pretime_shift], + hy_h * 2, + in_n.at(ti + 1), + hy_stride, + 0, + &wei[weitime_shift], + hy_h, + hy_h * 2, + uni_stride, + 0, + &wkspace[hid_shift + bacc * hy_stride + bi * 3 * hy_h], + hy_h, + in_n.at(ti + 1), + hy_stride, + 0, + 1, + 1); + + for(int bs = 0; bs < in_n.at(ti + 1); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] += + wkspace[pretime_shift + bs * hy_stride + bi * 3 * hy_h + h] * + activfunc(rsvspace[pretime_shift + bs * hy_stride + h], 2); + + wkspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h] = + wkspace[pretime_shift + bs * hy_stride + 2 * hy_h + h] * + activfunc(rsvspace[pretime_shift + bs * hy_stride + hy_h + h], 2); + } + } + + RNN_mm_cpu(&wkspace[hid_shift + bacc * hy_stride + 2 * hy_h], + hy_h, + in_n.at(ti + 1), + hy_stride, + 0, + &wei[weitime_shift + 2 * hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + 0, + &wkspace[hid_shift + bacc * hy_stride + bi * 3 * hy_h], + hy_h, + in_n.at(ti + 1), + hy_stride, + 0, + 1, + 1); + + for(int bs = 0; bs < in_n.at(ti + 1); bs++) + { + auto subidx = hid_shift + (bacc + bs) * hy_stride + 2 * hy_h; + std::fill(wkspace.begin() + subidx, wkspace.begin() + subidx + hy_h, 0); + } + + if(bidirection) + { + pretime_shift = li * batch_n * hy_stride + + (baccbi - in_n.at(seqLength - 2 - ti)) * hy_stride + hy_h * 3; + + RNN_mm_cpu(&wkspace[pretime_shift], + hy_h * 2, + in_n.at(seqLength - 1 - ti), + hy_stride, + 0, + &wei[weitime_shift + hy_h * 3 * uni_stride], + hy_h, + hy_h * 2, + uni_stride, + 0, + &wkspace[hid_shift + baccbi * hy_stride + bi * 3 * hy_h + hy_h], + hy_h, + in_n.at(seqLength - 1 - ti), + hy_stride, + 0, + 1, + 1); + + for(int bs = 0; bs < in_n.at(seqLength - 1 - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + hy_h + + h] += + wkspace[pretime_shift + bs * hy_stride + 3 * hy_h + hy_h + h] * + activfunc(rsvspace[pretime_shift + bs * hy_stride + h], 2); + + wkspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h] = + wkspace[pretime_shift + bs * hy_stride + 2 * hy_h + h] * + activfunc(rsvspace[pretime_shift + bs * hy_stride + hy_h + h], 2); + } + } + + RNN_mm_cpu(&wkspace[hid_shift + baccbi * hy_stride + 5 * hy_h], + hy_h, + in_n.at(seqLength - 1 - ti), + 
hy_stride, + 0, + &wei[weitime_shift + 5 * hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + 0, + &wkspace[hid_shift + baccbi * hy_stride + bi * 3 * hy_h + hy_h], + hy_h, + in_n.at(seqLength - 1 - ti), + hy_stride, + 0, + 1, + 1); + + for(int bs = 0; bs < in_n.at(seqLength - 1 - ti); bs++) + { + auto subidx = hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h; + std::fill(wkspace.begin() + subidx, wkspace.begin() + (subidx + hy_h), 0); + } + } + } + + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h] += + wkspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] * + (1 - activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + h], 2)) * + dervactivfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h], + 1); + + wkspace[hid_shift + (bacc + bs) * hy_stride + hy_h + h] = + (rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h + + numlayer * batch_n * hy_stride] * + wkspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h] * + dervactivfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + hy_h + h], + 2)); + + if(ti == 0) + { + if(!hx_is_null) + { + wkspace[hid_shift + (bacc + bs) * hy_stride + h] += + (wkspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] * + hx[hx_shift + bs * uni_stride + h] * + dervactivfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + h], + 2)); + } + wkspace[hid_shift + (bacc + bs) * hy_stride + h] -= + (wkspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] * + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h], + 1) * + dervactivfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + h], 2)); + } + else + { + wkspace[hid_shift + (bacc + bs) * hy_stride + h] += + wkspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h] * + (rsvspace[hid_shift + (bacc - in_n.at(ti - 1) + bs) * hy_stride + + bi * 3 * hy_h + h] - + activfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h], + 1)) * + dervactivfunc(rsvspace[hid_shift + (bacc + bs) * hy_stride + h], 2); + } + + rsvspace[hid_shift + (bacc + bs) * hy_stride + bi * 3 * hy_h + h + + numlayer * batch_n * hy_stride] = + wkspace[hid_shift + (bacc + bs) * hy_stride + 2 * hy_h + h] * + rsvspace[hid_shift + (bacc + bs) * hy_stride + hy_h + h + + numlayer * batch_n * hy_stride]; + } + } + + if(bidirection) + { + for(int bs = 0; bs < in_n.at(seqLength - 1 - ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h] += + wkspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + hy_h + + h] * + (1 - activfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 3 * hy_h + h], + 2)) * + dervactivfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h], 1); + + wkspace[hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h] = + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + hy_h + + h + numlayer * batch_n * hy_stride]; + + wkspace[hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h] *= + (wkspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h] * + dervactivfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h], + 2)); + + if(ti == 0) + { + if(!hx_is_null) + { + wkspace[hid_shift + (baccbi + bs) * hy_stride + 3 * hy_h + h] += + (wkspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] * + hx[hx_shift + bs * uni_stride + hy_n * hy_h + h] * + dervactivfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 3 * hy_h + h], + 2)); + } + 
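
The element-wise block above is the standard GRU backward recurrence written against the flat workspace layout. With z = sigmoid(z_pre), r = sigmoid(r_pre), c = tanh(c_pre) and h_t = (1 - z) * c + z * h_prev, the chain rule gives dc_pre = dh * (1 - z) * tanh'(c_pre); dr_pre = (U_c * h_prev) * dc_pre * sigmoid'(r_pre), where U_c * h_prev is the hidden-side candidate term stashed in the upper half of rsvspace during the forward pass; and dz_pre = dh * (h_prev - c) * sigmoid'(z_pre), which the code accumulates as a += of the h_prev term followed by a -= of the c term. A self-contained single-element sketch of the same algebra (names hypothetical, not the MIOpen helpers):

    #include <cmath>

    // Per-element GRU backward step, matching activfunc(., 2) = sigmoid and
    // activfunc(., 1) = tanh in the loops above.
    struct GruGrads { double dz_pre, dr_pre, dc_pre, dh_prev; };

    inline GruGrads gru_cell_backward(double dh, double z_pre, double r_pre,
                                      double c_pre, double h_prev, double uc_h_prev)
    {
        const double z = 1.0 / (1.0 + std::exp(-z_pre));
        const double r = 1.0 / (1.0 + std::exp(-r_pre));
        const double c = std::tanh(c_pre);

        GruGrads g{};
        g.dc_pre  = dh * (1.0 - z) * (1.0 - c * c);       // tanh'(c_pre)
        g.dr_pre  = uc_h_prev * g.dc_pre * r * (1.0 - r); // sigmoid'(r_pre)
        g.dz_pre  = dh * (h_prev - c) * z * (1.0 - z);    // sigmoid'(z_pre)
        g.dh_prev = dh * z; // the GEMMs above add the remaining W^T contributions
        return g;
    }

The ti == 0 and bs-range branches in the surrounding loop differ only in where h_prev comes from (the user-supplied hx versus the previous time step's reserve-space row), which is how variable per-step batch sizes are honored.
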
wkspace[hid_shift + (baccbi + bs) * hy_stride + 3 * hy_h + h] -= + (wkspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] * + activfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h], + 1) * + dervactivfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 3 * hy_h + h], + 2)); + } + else + { + if(!hx_is_null && + in_n.at(seqLength - 1 - ti) > in_n.at(seqLength - ti) && + bs >= in_n.at(seqLength - ti)) + { + wkspace[hid_shift + (baccbi + bs) * hy_stride + 3 * hy_h + h] += + (wkspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] * + hx[hx_shift + bs * uni_stride + hy_n * hy_h + h] * + dervactivfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 3 * hy_h + h], + 2)); + } + + if(bs < in_n.at(seqLength - ti)) + { + wkspace[hid_shift + (baccbi + bs) * hy_stride + 3 * hy_h + h] += + (wkspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] * + rsvspace[hid_shift + + (baccbi + in_n.at(seqLength - 1 - ti) + bs) * + hy_stride + + bi * 3 * hy_h + hy_h + h] * + dervactivfunc(rsvspace[hid_shift + (baccbi + bs) * hy_stride + + 3 * hy_h + h], + 2)); + } + wkspace[hid_shift + (baccbi + bs) * hy_stride + 3 * hy_h + h] -= + (wkspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + + hy_h + h] * + activfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h], + 1) * + dervactivfunc( + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 3 * hy_h + h], + 2)); + } + + rsvspace[hid_shift + (baccbi + bs) * hy_stride + bi * 3 * hy_h + hy_h + h + + numlayer * batch_n * hy_stride] = + wkspace[hid_shift + (baccbi + bs) * hy_stride + 5 * hy_h + h] * + rsvspace[hid_shift + (baccbi + bs) * hy_stride + 4 * hy_h + h + + numlayer * batch_n * hy_stride]; + } + } + } + + baccbi += in_n.at(seqLength - 1 - ti); + } + + // dhx + int pretime_shift = li * batch_n * hy_stride; + + RNN_mm_cpu(&wkspace[pretime_shift], + hy_h * 2, + in_n.at(0), + hy_stride, + 0, + &wei[weitime_shift], + hy_h, + hy_h * 2, + uni_stride, + 0, + &dhx[hx_shift], + hy_h, + in_n.at(0), + uni_stride, + 0, + 1, + 1); + + for(int bs = 0; bs < in_n.at(0); bs++) + { + for(int h = 0; h < hy_h; h++) + { + dhx[hx_shift + bs * uni_stride + h] += + wkspace[pretime_shift + bs * hy_stride + bi * 3 * hy_h + h] * + activfunc(rsvspace[pretime_shift + bs * hy_stride + h], 2); + + dcx[hx_shift + bs * uni_stride + h] = + wkspace[pretime_shift + bs * hy_stride + 2 * hy_h + h] * + activfunc(rsvspace[pretime_shift + bs * hy_stride + hy_h + h], 2); + } + } + + RNN_mm_cpu(&dcx[hx_shift], + hy_h, + in_n.at(0), + uni_stride, + 0, + &wei[weitime_shift + 2 * hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + 0, + &dhx[hx_shift], + hy_h, + in_n.at(0), + uni_stride, + 0, + 1, + 1); + + if(bidirection) + { + int ti = seqLength - 1, cur_bat = 0, pre_bat = batch_n; + + while(ti >= 0) + { + pre_bat -= in_n.at(ti); + if(in_n.at(ti) > cur_bat) + { + pretime_shift = li * batch_n * hy_stride + (pre_bat + cur_bat) * hy_stride; + + RNN_mm_cpu(&wkspace[pretime_shift + 3 * hy_h], + hy_h * 2, + (in_n.at(ti) - cur_bat), + hy_stride, + 0, + &wei[weitime_shift + 3 * hy_h * uni_stride], + hy_h, + hy_h * 2, + uni_stride, + 0, + &dhx[hx_shift + hy_n * hy_h + cur_bat * hy_h], + hy_h, + (in_n.at(ti) - cur_bat), + uni_stride, + 0, + 1, + 1); + + for(int bs = cur_bat; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + dhx[hx_shift + bs * uni_stride + hy_n * hy_h + h] += + wkspace[pretime_shift + (bs - cur_bat) * hy_stride + bi * 3 * hy_h + + hy_h + h] * + activfunc(rsvspace[pretime_shift 
+ (bs - cur_bat) * hy_stride + + 3 * hy_h + h], + 2); + + dcx[hx_shift + bs * uni_stride + hy_n * hy_h + h] = + wkspace[pretime_shift + (bs - cur_bat) * hy_stride + 5 * hy_h + h] * + activfunc(rsvspace[pretime_shift + (bs - cur_bat) * hy_stride + + 4 * hy_h + h], + 2); + } + } + + RNN_mm_cpu(&dcx[hx_shift + hy_n * hy_h + cur_bat * hy_h], + hy_h, + (in_n.at(ti) - cur_bat), + uni_stride, + 0, + &wei[weitime_shift + 5 * hy_h * uni_stride], + hy_h, + hy_h, + uni_stride, + 0, + &dhx[hx_shift + hy_n * hy_h + cur_bat * hy_h], + hy_h, + (in_n.at(ti) - cur_bat), + uni_stride, + 0, + 1, + 1); + } + cur_bat = in_n.at(ti--); + } + } + } + + // dinput + if(inputMode == 1) + { + for(int bs = 0; bs < batch_n; bs++) + { + for(int h = 0; h < hy_h; h++) + { + for(int gi = 0; gi < 3; gi++) + { + din[bs * in_stride + h] += wkspace[bs * hy_stride + gi * hy_h + h]; + if(bidirection) + { + din[bs * in_stride + h] += wkspace[bs * hy_stride + (gi + 3) * hy_h + h]; + } + } + } + } + } + else + { + RNN_mm_cpu(wkspace.data(), + hy_h * bi * 3, + batch_n, + hy_stride, + 0, + wei.data(), + in_h, + hy_h * bi * 3, + in_stride, + 0, + din.data(), + in_h, + batch_n, + in_stride, + 0, + 1, + 1); + } +} + +template +void GRUBwdWeightCPUVerify(bool use_dropout, + const std::vector& in, + std::vector& dwei, // [ input_state_weight_trans + // hidden_state_weight0_trans + // input1_trans hidden1_trans ... + // output_weight; bidirectional + // reversed weights ] + const std::vector& hx, // initial hidden state + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength, // Number of iterations to unroll over + bool bidirection, // whether using bidirectional net + bool biased, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden + // layers) for unidirection, 2 by numlayer for + // bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + // by hy_h related function for bidirection + int inputMode, + const std::vector& rsvspace, + std::vector& wkspace, + bool hx_is_null = false) +{ + int batch_n = sumvc(in_n); + int numlayer = bidirection ? hy_d / 2 : hy_d; + int bi = bidirection ? 2 : 1; + + int in_stride = in_h; + int wei_stride = bi * 3 * hy_h; + int hy_stride = bi * 4 * hy_h; + int h_stride = bi * hy_h; + int uni_stride = hy_h; + int bi_stride = hy_h * bi; + + if(inputMode == 1) + { + if(in_h != hy_h) + { + std::cout + << "Verification cannot be completed: The input tensor size must equal to the " + << "hidden state size of the network in SKIP_INPUT mode!" << std::endl; + return; + } + in_h = 0; + } + + int wei_shift_bias = (in_h + hy_h + (bi * hy_h + hy_h) * (numlayer - 1)) * wei_stride; + + // bwd weights emulator + for(int li = 0; li < numlayer; li++) + { + // between layers + if(li == 0) + { + if(inputMode == 0) + { + RNN_mm_cpu(wkspace.data(), + hy_h * bi * 3, + batch_n, + hy_stride, + RNN_MM_TRANSPOSE, + in.data(), + in_h, + batch_n, + in_stride, + 0, + dwei.data(), + in_h, + hy_h * bi * 3, + in_stride, + 0, + 1, + 1); + } + + if(biased) + { + for(int h = 0; h < wei_stride; h++) + { + for(int w = 0; w < batch_n; w++) + { + dwei[wei_shift_bias + h] += wkspace[w * hy_stride + h]; + } + } + } + } + else + { + int prelayer_shift = + use_dropout ? 
2 * numlayer * batch_n * hy_stride + (li - 1) * batch_n * hy_h * bi + : (li - 1) * batch_n * hy_stride + bi * hy_h * 3; + int hid_shift = li * batch_n * hy_stride; + int wei_shift = (in_h + hy_h) * wei_stride + (li - 1) * (bi * hy_h + hy_h) * wei_stride; + + RNN_mm_cpu(&wkspace[hid_shift], + hy_h * bi * 3, + batch_n, + hy_stride, + RNN_MM_TRANSPOSE, + &rsvspace[prelayer_shift], + hy_h * bi, + batch_n, + use_dropout ? hy_h * bi : hy_stride, + 0, + &dwei[wei_shift], + hy_h * bi, + hy_h * bi * 3, + bi_stride, + 0, + 1, + 1); + + if(biased) + { + wei_shift = wei_shift_bias + li * 2 * wei_stride; + + for(int h = 0; h < wei_stride; h++) + { + for(int w = 0; w < batch_n; w++) + { + dwei[wei_shift + h] += wkspace[hid_shift + w * hy_stride + h]; + } + } + } + } + + // between time + int bacc = 0; + for(int ti = 0; ti < seqLength; ti++) + { + int hid_shift = li * batch_n * hy_stride + bacc * hy_stride; + int hx_shift = li * in_n.at(0) * h_stride; + int wei_shift = in_h * wei_stride + li * (bi * hy_h + hy_h) * wei_stride; + int pretime_shift; + + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace[hid_shift + bs * hy_stride + 2 * hy_h + h] *= + activfunc(rsvspace[hid_shift + bs * hy_stride + hy_h + h], 2); + } + } + + // between time + if(ti == 0) + { + if(!hx_is_null) + { + RNN_mm_cpu(&wkspace[hid_shift], + hy_h * 3, + in_n.at(ti), + hy_stride, + RNN_MM_TRANSPOSE, + &hx[hx_shift], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &dwei[wei_shift], + hy_h, + hy_h * 3, + uni_stride, + 0, + 1, + 1); + + if(biased) + { + int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; + + for(int h = 0; h < hy_h * 3; h++) + { + for(int w = 0; w < in_n.at(ti); w++) + { + dwei[bias_shift + h] += wkspace[hid_shift + w * hy_stride + h]; + } + } + } + } + } + else + { + pretime_shift = + li * batch_n * hy_stride + (bacc - in_n.at(ti - 1)) * hy_stride + bi * 3 * hy_h; + + RNN_mm_cpu(&wkspace[hid_shift], + hy_h * 3, + in_n.at(ti), + hy_stride, + RNN_MM_TRANSPOSE, + &rsvspace[pretime_shift], + hy_h, + in_n.at(ti), + hy_stride, + 0, + &dwei[wei_shift], + hy_h, + hy_h * 3, + uni_stride, + 0, + 1, + 1); + + if(biased) + { + int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; + + for(int h = 0; h < hy_h * 3; h++) + { + for(int w = 0; w < in_n.at(ti); w++) + { + dwei[bias_shift + h] += wkspace[hid_shift + w * hy_stride + h]; + } + } + } + } + + if(bidirection) + { + for(int bs = 0; bs < in_n.at(ti); bs++) + { + for(int h = 0; h < hy_h; h++) + { + wkspace[hid_shift + bs * hy_stride + 5 * hy_h + h] *= + activfunc(rsvspace[hid_shift + bs * hy_stride + 4 * hy_h + h], 2); + } + } + + if(ti == seqLength - 1) + { + if(!hx_is_null) + { + RNN_mm_cpu(&wkspace[hid_shift + 3 * hy_h], + hy_h * 3, + in_n.at(ti), + hy_stride, + RNN_MM_TRANSPOSE, + &hx[hx_shift + hy_n * hy_h], + hy_h, + in_n.at(ti), + uni_stride, + 0, + &dwei[wei_shift + 3 * hy_h * uni_stride], + hy_h, + hy_h * 3, + uni_stride, + 0, + 1, + 1); + + if(biased) + { + int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; + + for(int h = 0; h < hy_h * 3; h++) + { + for(int w = 0; w < in_n.at(ti); w++) + { + dwei[bias_shift + 3 * hy_h + h] += + wkspace[hid_shift + 3 * hy_h + w * hy_stride + h]; + } + } + } + } + } + else + { + if(!hx_is_null && in_n.at(ti) > in_n.at(ti + 1)) + { + RNN_mm_cpu(&wkspace[hid_shift + 3 * hy_h + in_n.at(ti + 1) * hy_stride], + hy_h * 3, + (in_n.at(ti) - in_n.at(ti + 1)), + hy_stride, + RNN_MM_TRANSPOSE, + &hx[hx_shift + hy_n * hy_h + in_n.at(ti + 1) * hy_h], + hy_h, + 
(in_n.at(ti) - in_n.at(ti + 1)), + uni_stride, + 0, + &dwei[wei_shift + 3 * hy_h * uni_stride], + hy_h, + hy_h * 3, + uni_stride, + 0, + 1, + 1); + + if(biased) + { + int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; + + for(int h = 0; h < hy_h * 3; h++) + { + for(int w = in_n.at(ti + 1); w < in_n.at(ti); w++) + { + dwei[bias_shift + 3 * hy_h + h] += + wkspace[hid_shift + 3 * hy_h + w * hy_stride + h]; + } + } + } + } + + pretime_shift = + li * batch_n * hy_stride + (bacc + in_n.at(ti)) * hy_stride + bi * 3 * hy_h; + + RNN_mm_cpu(&wkspace[hid_shift + 3 * hy_h], + hy_h * 3, + in_n.at(ti + 1), + hy_stride, + RNN_MM_TRANSPOSE, + &rsvspace[pretime_shift + hy_h], + hy_h, + in_n.at(ti + 1), + hy_stride, + 0, + &dwei[wei_shift + 3 * hy_h * uni_stride], + hy_h, + hy_h * 3, + uni_stride, + 0, + 1, + 1); + + if(biased) + { + int bias_shift = wei_shift_bias + li * 2 * wei_stride + wei_stride; + + for(int h = 0; h < hy_h * 3; h++) + { + for(int w = 0; w < in_n.at(ti + 1); w++) + { + dwei[bias_shift + 3 * hy_h + h] += + wkspace[hid_shift + 3 * hy_h + w * hy_stride + h]; + } + } + } + } + } + + bacc += in_n.at(ti); + } + } +} + +//////=========END GRU CPU VERIFICATION FUNCTIONS============= + +/********************************************** + * Uniform RNN callers for verification on CPU + **********************************************/ +template +void UniformRNNFwdTrainCPUVerify( + miopen::Handle& handle, + bool use_dropout, + const miopen::DropoutDescriptor& dropoutDesc, + const std::vector& in, + const std::vector& wei, // [ input_state_weight_trans + // hidden_state_weight0_trans input1_trans + // hidden1_trans ... output_weight; + // bidirectional reversed weights ] + std::vector& hy_host, // current/final hidden state + const std::vector& hx, // initial hidden state + std::vector& cy_host, // current/final cell state + const std::vector& cx, // initial cell state + std::vector& out_host, + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength_cpu, // Number of iterations to unroll over + int bidirection, // whether using bidirectional net + int biased, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden layers) for + // unidirection, 2 by numlayer for bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + int out_h, // 1 by hy_h related function for unidirection, 2 by hy_h + // related function for bidirection + miopenRNNMode_t rnn_mode, + int inputMode_cpu, + std::vector& rsvspace, + bool hx_is_null, + bool cx_is_null) +{ + switch(rnn_mode) + { + case miopenRNNRELU: + case miopenRNNTANH: + RNNFwdTrainCPUVerify(handle, + use_dropout, + dropoutDesc, + in, + wei, + hy_host, + hx, + out_host, + in_n, + in_h, + seqLength_cpu, + bidirection, + biased, + hy_d, + hy_n, + hy_h, + out_h, + rnn_mode, + inputMode_cpu, + rsvspace, + hx_is_null); + break; + case miopenLSTM: + LSTMFwdCPUVerify(handle, + use_dropout, + dropoutDesc, + in, + wei, + hy_host, + hx, + cy_host, + cx, + out_host, + in_n, + in_h, + seqLength_cpu, + bidirection, + biased, + hy_d, + hy_n, + hy_h, + out_h, + inputMode_cpu, + rsvspace, + hx_is_null, + cx_is_null); + break; + case miopenGRU: + GRUFwdCPUVerify(handle, + use_dropout, + dropoutDesc, + in, + wei, + hy_host, + hx, + out_host, + in_n, + in_h, + seqLength_cpu, + bidirection, + biased, + hy_d, + hy_n, + hy_h, + out_h, + inputMode_cpu, + rsvspace, + hx_is_null); + break; + default: MIOPEN_THROW("ERROR : rnn_mode unknown"); break; + }; +} + +template +void 
UniformRNNBwdTrainCPUVerify( + bool use_dropout, + const miopen::DropoutDescriptor& dropoutDesc, + std::vector& din_host, + const std::vector& wei, // [ input_state_weight_trans + // hidden_state_weight0_trans input1_trans + // hidden1_trans ... output_weight; + // bidirectional reversed weights ] + const std::vector& dhy, // current/final hidden state + std::vector& dhx_host, + const std::vector& hx, // initial hidden state + const std::vector& dcy, // current/final cell state + std::vector& dcx_host, + const std::vector& cx, + const std::vector& out, + const std::vector& dout, + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength, // Number of iterations to unroll over + int bidirection, // whether using bidirectional net + int, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden layers) + // for unidirection, 2 by numlayer for bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + int out_h, // 1 by hy_h related function for unidirection, 2 by + // hy_h related function for bidirection + miopenRNNMode_t rnn_mode, + int inputMode, + std::vector& rsvspace, + std::vector& wkspace, + bool hx_is_null, + bool cx_is_null, + bool dhy_is_null, + bool dcy_is_null) +{ + switch(rnn_mode) + { + case miopenRNNRELU: + case miopenRNNTANH: + RNNBwdDataCPUVerify(use_dropout, + dropoutDesc, + din_host, + wei, + dhy, + dhx_host, + hx, + out, + dout, + in_n, + in_h, + seqLength, + bidirection, + 0, + hy_d, + hy_n, + hy_h, + out_h, + rnn_mode, + inputMode, + rsvspace, + wkspace, + dhy_is_null); + break; + case miopenLSTM: + LSTMBwdDataCPUVerify(use_dropout, + dropoutDesc, + din_host, + wei, + dhy, + dhx_host, + hx, + dcy, + dcx_host, + cx, + out, + dout, + in_n, + in_h, + seqLength, + bidirection, + 0, + hy_d, + hy_n, + hy_h, + out_h, + inputMode, + rsvspace, + wkspace, + cx_is_null, + dhy_is_null, + dcy_is_null); + break; + case miopenGRU: + GRUBwdDataCPUVerify(use_dropout, + dropoutDesc, + din_host, + wei, + dhy, + dhx_host, + hx, + out, + dout, + in_n, + in_h, + seqLength, + bidirection, + 0, + hy_d, + hy_n, + hy_h, + out_h, + inputMode, + rsvspace, + wkspace, + hx_is_null, + dhy_is_null); + break; + default: MIOPEN_THROW("ERROR : rnn_mode unknown"); break; + }; +} + +template +void UniformRNNBwdWeightCPUVerify(bool use_dropout, + const std::vector& in, + std::vector& dwei_host, // [ input_state_weight_trans + // hidden_state_weight0_trans + // input1_trans hidden1_trans ... 
+ // output_weight; bidirectional + // reversed weights ] + const std::vector& hx, // initial hidden state + const std::vector& dout, + const std::vector& in_n, // input batch size + int in_h, // input data length + int seqLength, // Number of iterations to unroll over + int bidirection, // whether using bidirectional net + int biased, // whether using bias + int hy_d, // 1 by numlayer (number of stacks of hidden + // layers) for unidirection, 2 by numlayer for + // bidirection + int hy_n, // equal to input batch size in_n[0] + int hy_h, // hidden state number + int out_h, // 1 by hy_h related function for unidirection, 2 + // by hy_h related function for bidirection + miopenRNNMode_t rnn_mode, + int inputMode, + const std::vector& rsvspace, + std::vector& wkspace, + bool hx_is_null) +{ + switch(rnn_mode) + { + case miopenRNNRELU: + case miopenRNNTANH: + RNNBwdWeightCPUVerify(use_dropout, + in, + dwei_host, + hx, + dout, + in_n, + in_h, + seqLength, + bidirection, + biased, + hy_d, + hy_n, + hy_h, + out_h, + rnn_mode, + inputMode, + rsvspace, + wkspace, + hx_is_null); + break; + case miopenLSTM: + LSTMBwdWeightCPUVerify(use_dropout, + in, + dwei_host, + hx, + dout, + in_n, + in_h, + seqLength, + bidirection, + biased, + hy_d, + hy_n, + hy_h, + out_h, + inputMode, + rsvspace, + wkspace, + hx_is_null); + break; + case miopenGRU: + GRUBwdWeightCPUVerify(use_dropout, + in, + dwei_host, + hx, + in_n, + in_h, + seqLength, + bidirection, + biased, + hy_d, + hy_n, + hy_h, + inputMode, + rsvspace, + wkspace, + hx_is_null); + break; + default: MIOPEN_THROW("ERROR : rnn_mode unknown"); break; + }; +} + +inline size_t GetUniRNNCPUWorkspaceScaleSize(int rnn_mode) +{ + switch(rnn_mode) + { + case miopenRNNTANH: + case miopenRNNRELU: return 1; + case miopenLSTM: return 6; + case miopenGRU: return 4; + default: MIOPEN_THROW("ERROR : rnn_mode unknown"); break; + } +} + +inline size_t GetUniRNNCPUHiddenTensors(int rnn_mode) +{ + switch(rnn_mode) + { + case miopenRNNTANH: + case miopenRNNRELU: return 1; + case miopenLSTM: return 4; + case miopenGRU: return 3; + default: MIOPEN_THROW("ERROR : rnn_mode unknown"); break; + } +} + +inline size_t UniRNNCPUReserveSpaceSize(miopenRNNMode_t rnn_mode, + size_t nLayers, + size_t inputTotalBatch, + size_t outH, + size_t sizeOfT, + bool use_dropout) +{ + int workspace_scale = GetUniRNNCPUWorkspaceScaleSize(rnn_mode); + + size_t reserveSpaceSize = 2ULL * workspace_scale * nLayers * inputTotalBatch * outH; + size_t dropout_reserveSpaceSize = 0; + if(use_dropout) + { + dropout_reserveSpaceSize = (nLayers - 1) * inputTotalBatch * outH; + dropout_reserveSpaceSize += (dropout_reserveSpaceSize + sizeOfT - 1) / sizeOfT; + } + return reserveSpaceSize + dropout_reserveSpaceSize; +} + +inline size_t UniRNNCPUWorkSpaceByteSize(miopenRNNMode_t rnn_mode, + size_t nLayers, + size_t inputTotalBatch, + size_t hidVec, + size_t sizeOfT, + bool isBidirect) +{ + int workspace_scale = GetUniRNNCPUWorkspaceScaleSize(rnn_mode); + return (workspace_scale * nLayers * inputTotalBatch * hidVec * sizeOfT) * (isBidirect ? 2 : 1); +} + +inline size_t UniRNNCPUHiddenStateSize(size_t hiddenLayers, size_t hiddenBatchSize, size_t hidVec) +{ + return hiddenLayers * hiddenBatchSize * hidVec; +} + +inline size_t UniRNNCPUCellStateSize(miopenRNNMode_t rnn_mode, + size_t hiddenLayers, + size_t hiddenBatchSize, + size_t hidVec) +{ + return rnn_mode == miopenLSTM ? 
UniRNNCPUHiddenStateSize(hiddenLayers, hiddenBatchSize, hidVec) + : 0; +} + +inline size_t UniRNNCPUIOSize(size_t TotalBatchsPerSeqLen, size_t ioVecLen) +{ + return TotalBatchsPerSeqLen * ioVecLen; +} + +inline size_t UniRNNCPUWeightSize(miopenRNNMode_t rnn_mode, + size_t nLayers, + size_t hidVec, + size_t inVec, + bool biasMode, + bool inputMode, + bool dirMode) +{ + const size_t hidden_tensors_per_layer = GetUniRNNCPUHiddenTensors(rnn_mode); + if(inputMode) + { + inVec = 0; + } + + int bi = dirMode ? 2 : 1; + auto sz = hidden_tensors_per_layer * hidVec * bi * + (inVec + hidVec + (nLayers - 1) * (bi + 1) * hidVec); + + if(biasMode) + { + sz += nLayers * 2 * hidden_tensors_per_layer * hidVec * bi; + } + return sz; +} diff --git a/test/driver.hpp b/test/driver.hpp index f29e6efa4d..825a0d4511 100644 --- a/test/driver.hpp +++ b/test/driver.hpp @@ -688,7 +688,8 @@ struct test_driver << out_gpu[gpu_nan_idx] << std::endl; } } - else if(miopen::range_zero(out_cpu) and miopen::range_zero(out_gpu)) + else if(miopen::range_zero(out_cpu) and miopen::range_zero(out_gpu) and + (miopen::range_distance(out_cpu) != 0)) { show_command(); std::cout << "Warning: Both CPU and GPU data is all zero" << std::endl; diff --git a/test/lstm_common.hpp b/test/lstm_common.hpp index 1f0d5f98cb..a09aab4209 100644 --- a/test/lstm_common.hpp +++ b/test/lstm_common.hpp @@ -35,7 +35,7 @@ #include "verify.hpp" #include "rnn_util.hpp" #include "random.hpp" -#include "cpu_lstm.hpp" +#include "cpu_rnn.hpp" #include #include #include @@ -229,10 +229,10 @@ struct verify_backward_data_lstm switch(badtensor) { - case(0): std::cout << "Output dx failed verification." << std::endl; break; - case(1): std::cout << "Hidden state dhx tensor failed verification." << std::endl; break; - case(2): std::cout << "Hidden cell dcx tensor failed verification." << std::endl; break; - case(3): std::cout << "Workspace space tensor failed verification." << std::endl; break; + case(0): std::cout << "Output dx tensor report." << std::endl; break; + case(1): std::cout << "Hidden state dhx tensor report." << std::endl; break; + case(2): std::cout << "Hidden cell dcx tensor report." << std::endl; break; + case(3): std::cout << "Workspace space tensor report." << std::endl; break; default: break; } } @@ -641,7 +641,7 @@ struct verify_forward_infer_lstm : verify_forward_lstm std::cout << "hz: " << hiddenSize << " batch_n: " << batch_n << " seqLength: " << seqLength << " inputLen: " << inputVecLen << " numLayers: " << nLayers << std::endl; std::cout << "Forward Inference LSTM: " << std::endl; - std::cout << "Output tensor output failed verification." << std::endl; + std::cout << "Output tensor report." << std::endl; } }; //~~~~~~~~~~~~ END FWD INFERENCE ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1022,9 +1022,9 @@ struct verify_forward_train_lstm : verify_forward_lstm switch(badtensor) { - case(0): std::cout << "Output tensor output failed verification." << std::endl; break; - case(1): std::cout << "Hidden state tensor failed verification." << std::endl; break; - case(2): std::cout << "Cell state tensor failed verification." << std::endl; break; + case(0): std::cout << "Output tensor report." << std::endl; break; + case(1): std::cout << "Hidden state tensor report." << std::endl; break; + case(2): std::cout << "Cell state tensor report." 
<< std::endl; break; default: break; } } diff --git a/test/rnn_seq_api.cpp b/test/rnn_seq_api.cpp new file mode 100644 index 0000000000..cb9a4c8f51 --- /dev/null +++ b/test/rnn_seq_api.cpp @@ -0,0 +1,179 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "rnn_seq_api.hpp" + +template +struct rnn_seq_driver : rnn_seq_api_test_driver +{ + rnn_seq_driver() : rnn_seq_api_test_driver() + { + std::vector modes(2, 0); + modes[1] = 1; + + this->add(this->inVecLen, "vector-len", this->generate_data(std::vector{1, 7}, 7)); + this->add(this->hiddenSize, "hidden-size", this->generate_data({7, 1, 13}, 13)); + this->add(this->useDropout, "use-dropout", this->generate_data({0, 1})); + this->add(this->inputMode, "in-mode", this->generate_data(modes)); + this->add(this->biasMode, "bias-mode", this->generate_data({1})); + this->add(this->dirMode, "dir-mode", this->generate_data(modes)); + this->add(this->rnnMode, "rnn-mode", this->generate_data({2, 1, 3}, 2)); + this->add(this->algoMode, "algo-mode", this->generate_data({0})); + this->add(this->numLayers, "num-layers", this->generate_data({1, 3}, 3)); + this->add(this->io_layout, "io_layout", this->generate_data({2, 1, 3}, 3)); + this->add(this->batchSize, "batch-size", this->generate_data({1, 4, 6}, 6)); + this->add(this->seqLength, "seq-len", this->generate_data(std::vector{1, 4, 15}, 15)); + this->add(this->seqLenArray, + "seqLen-batch", + this->generate_data({ + {1, 15, 14, 15, 14, 1}, + {1, 0, 3, 4, 2, 0}, + {1, 2, 3, 4}, + {4, 3, 2, 1}, + {4, 4, 4, 4}, + {1}, + })); + + this->add(this->nohx, "nohx", this->generate_data({false})); + this->add(this->nocx, "nocx", this->generate_data({false, true})); + this->add(this->nohy, "nohy", this->generate_data({false})); + this->add(this->nocy, "nocy", this->generate_data({false, true})); + } + + rnn_seq_driver(bool) : rnn_seq_api_test_driver() {} + bool is_skip_comb() + { + if(!this->seqLenArray.empty()) + { + if(this->seqLenArray.size() != this->batchSize) + return true; + + bool is_seqLength_is_max_seq = + this->seqLength == + *std::max_element(this->seqLenArray.begin(), this->seqLenArray.end()); + + if(!is_seqLength_is_max_seq) + return true; + } + + return false; + } + + bool is_correct_params() + { + if(this->useDropout == 1 && 
(this->hiddenSize == 1 || this->batchSize == 1)) + return false; + + if(this->inputMode == 1 && this->hiddenSize != this->inVecLen) + return false; + + if((this->rnnMode != 2) && (!this->nocx || !this->nocy)) + return false; + + if(this->seqLenArray.size() > this->batchSize) + return false; + + if(!this->seqLenArray.empty()) + { + if(this->seqLength < + *std::max_element(this->seqLenArray.begin(), this->seqLenArray.end())) + return false; + + if(this->io_layout == 1) + { + return std::is_sorted( + this->seqLenArray.begin(), this->seqLenArray.end(), std::greater()); + } + } + return true; + } + + void run() + { + if(!this->full_set || (is_correct_params() && !is_skip_comb())) + rnn_seq_api_test_driver::run(); + else + { + if(this->verbose) + std::cout << "Incompatible argument combination, test skipped: " + << this->get_command_args() << std::endl; + } + } +}; + +template +struct lstm_MS_solver : rnn_seq_driver +{ + lstm_MS_solver() : rnn_seq_driver(true) + { + std::vector modes(2, 0); + modes[1] = 1; + + this->add(this->inVecLen, "vector-len", this->generate_data(std::vector{1, 7}, 7)); + this->add(this->hiddenSize, "hidden-size", this->generate_data({13, 1}, 13)); + this->add(this->useDropout, "use-dropout", this->generate_data({0})); + this->add(this->numLayers, "num-layers", this->generate_data({3})); + this->add(this->inputMode, "in-mode", this->generate_data({0})); + this->add(this->biasMode, "bias-mode", this->generate_data(modes)); + this->add(this->dirMode, "dir-mode", this->generate_data({0})); + this->add(this->rnnMode, "rnn-mode", this->generate_data({2})); + this->add(this->algoMode, "algo-mode", this->generate_data({0})); + + this->add(this->io_layout, "io_layout", this->generate_data({2}, 2)); + this->add(this->batchSize, "batch-size", this->generate_data({1, 6}, 6)); + this->add(this->seqLength, "seq-len", this->generate_data({38})); + this->add(this->seqLenArray, + "seqLen-batch", + this->generate_data({ + {34, 3, 2, 1}, + {1, 15, 34, 15, 34, 1}, + {}, + })); + + this->add(this->nohx, "nohx", this->generate_data(modes)); + this->add(this->nocx, "nocx", this->generate_data(modes)); + this->add(this->nohy, "nohy", this->generate_data(modes)); + this->add(this->nocy, "nocy", this->generate_data(modes)); + } + + void run() + { + // WA skip this test + if(this->nohx && this->biasMode == 1) + return; + + // Optimization of test coverage. + // Non-float types are not used in this code-path and must be tested using another subtest. + if(this->type == miopenFloat) + rnn_seq_driver::run(); + } +}; + +int main(int argc, const char* argv[]) +{ + test_drive(argc, argv); + test_drive(argc, argv); +} diff --git a/test/rnn_seq_api.hpp b/test/rnn_seq_api.hpp new file mode 100644 index 0000000000..38f834d772 --- /dev/null +++ b/test/rnn_seq_api.hpp @@ -0,0 +1,1656 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "driver.hpp" +#include "dropout_util.hpp" +#include "get_handle.hpp" + +#include "random.hpp" +#include +#include +#include +#include + +#include +#include +#include + +// +// Native rnn tensor format +// +#include "seq_tensor_holder.hpp" +#include "rnn_util.hpp" +#include "cpu_rnn.hpp" +/// + +template +miopen::Allocator::ManageDataPtr +createTensorAtGPUOrNullptr(miopen::Handle& handle, TensorT& tensor, bool is_nullptr) +{ + if(!is_nullptr) + return handle.Create(tensor.GetDataByteSize()); + else + return nullptr; +} + +template +miopen::Allocator::ManageDataPtr +transferTensorToGPUOrNullptr(miopen::Handle& handle, TensorT& tensor, bool is_nullptr) +{ + if(!is_nullptr) + return handle.Write(tensor.data); + else + return nullptr; +} + +// read from GPU cnt elements of type T +template