Sequence Operator Improvements (apache#9306)
* added axis parameter + refactored to use expression template

* refactor of sequence last

* add axis to sequence reverse

* add axis support to sequence mask, rewrite kernels, fix bug for kAddTo

* remove header

* add rigorous tests for sequence ops

* conflict

* remove conflict

* various sequence op fixes

* added 2 spaces for top-level python functions to avoid PyCharm lint warning
sbodenstein authored and yuxiangw committed Jan 25, 2018
1 parent 28eb235 commit 2e5fa09
Showing 4 changed files with 351 additions and 117 deletions.
178 changes: 130 additions & 48 deletions src/operator/sequence_last-inl.h
@@ -45,16 +45,46 @@ namespace op {
 namespace seq_last {
 enum SequenceLastOpInputs { kData, kSequenceLength };
 enum SequenceLastOpOutputs { kOut };
+enum SequenceLastOpResource { kTempSpace };
 }
 
 struct SequenceLastParam : public dmlc::Parameter<SequenceLastParam> {
   bool use_sequence_length;
+  int axis;
   DMLC_DECLARE_PARAMETER(SequenceLastParam) {
     DMLC_DECLARE_FIELD(use_sequence_length)
         .set_default(false)
         .describe(
-            "If set to true, this layer takes in an extra input parameter `sequence_length` "
+            "If set to true, this layer takes in an extra input parameter "
+            "`sequence_length` "
             "to specify variable length sequence");
+    DMLC_DECLARE_FIELD(axis).set_default(0).describe(
+        "The sequence axis. Only values of 0 and 1 are currently supported.");
   }
 };
 
+template <int req>
+struct SequenceLastKernel {
+  template <typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in,
+                                  const DType *idx, int offset1, int offset2,
+                                  mshadow::Shape<2> oshape) {
+    const auto opos = mxnet_op::unravel(i, oshape);
+    const int seqpos = static_cast<int>(idx[opos[0]]) - 1;
+    const int ipos = seqpos * offset1 + opos[0] * offset2 + opos[1];
+    KERNEL_ASSIGN(out[i], req, in[ipos]);
+  }
+};
+
+struct SequenceLastGradKernel {
+  template <typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *in_grad, const DType *out_grad,
+                                  const DType *idx, int offset1, int offset2,
+                                  mshadow::Shape<2> oshape) {
+    const auto opos = mxnet_op::unravel(i, oshape);
+    const int seqpos = static_cast<int>(idx[opos[0]]) - 1;
+    const int ipos = seqpos * offset1 + opos[0] * offset2 + opos[1];
+    in_grad[ipos] += out_grad[i];
+  }
+};
+
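Note: both kernels above reduce the lookup to one flattened index, ipos = seqpos * offset1 + opos[0] * offset2 + opos[1], with the two offsets chosen from the axis. A minimal standalone sketch of that arithmetic (illustrative only, not part of this commit; flat_index is a hypothetical helper):

#include <cassert>

// Mirrors the kernels' flattened-index computation for data viewed as 3-D:
// (seq, batch, rest) when axis == 0, (batch, seq, rest) when axis == 1.
int flat_index(int axis, int t, int b, int r,
               int batch, int max_seq_len, int rest) {
  const int out_size = batch * rest;
  const int offset1 = axis ? rest : out_size;              // stride of the sequence position
  const int offset2 = axis ? (max_seq_len * rest) : rest;  // stride of the batch position
  return t * offset1 + b * offset2 + r;
}

int main() {
  // axis == 0, layout (seq, batch, rest): element (t=2, b=1, r=3)
  assert(flat_index(0, 2, 1, 3, /*batch=*/4, /*max_seq_len=*/5, /*rest=*/6) ==
         (2 * 4 + 1) * 6 + 3);
  // axis == 1, layout (batch, seq, rest): element (t=2, b=1, r=3)
  assert(flat_index(1, 2, 1, 3, 4, 5, 6) == (1 * 5 + 2) * 6 + 3);
  return 0;
}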
@@ -63,6 +93,47 @@ class SequenceLastOp : public Operator {
  public:
   explicit SequenceLastOp(SequenceLastParam p) { this->param_ = p; }
 
+  void sequence_last(const mshadow::Tensor<xpu, 3, DType> &data,
+                     const mshadow::Tensor<xpu, 2, DType> &out,
+                     const mshadow::Tensor<xpu, 1, DType> &indices,
+                     const OpReqType req, mshadow::Stream<xpu> *const s) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+
+    int axis = param_.axis;
+    int out_size = out.size(0) * out.size(1);
+    int max_seq_len = data.size(axis);
+    int offset1 = axis ? out.size(1) : out_size;
+    int offset2 = axis ? (max_seq_len * out.size(1)) : out.size(1);
+
+    MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
+      mxnet_op::Kernel<SequenceLastKernel<req_type>, xpu>::Launch(
+          s, out_size, out.dptr_, data.dptr_, indices.dptr_, offset1, offset2,
+          out.shape_);
+    });
+  }
+
+  void sequence_last_grad(const mshadow::Tensor<xpu, 3, DType> &in_grad,
+                          const mshadow::Tensor<xpu, 2, DType> &out_grad,
+                          const mshadow::Tensor<xpu, 1, DType> &indices,
+                          mshadow::Stream<xpu> *const s) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+
+    auto axis = param_.axis;
+    int batch = out_grad.size(0);
+    int rest = out_grad.size(1);
+    int out_size = batch * rest;
+
+    int max_seq_len = in_grad.size(axis);
+    int offset1 = axis ? rest : out_size;
+    int offset2 = axis ? (max_seq_len * rest) : rest;
+
+    mxnet_op::Kernel<SequenceLastGradKernel, xpu>::Launch(
+        s, out_size, in_grad.dptr_, out_grad.dptr_, indices.dptr_, offset1,
+        offset2, out_grad.shape_);
+  }
+
   virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
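Note: in the forward helper, KERNEL_ASSIGN lets the kernel honor the request type, which is the kAddTo fix named in the commit message; the gradient kernel instead accumulates unconditionally and relies on the caller zeroing the buffer for kWriteTo. A simplified, self-contained sketch of the assumed dispatch semantics (not the actual macro from mxnet_op.h):

#include <cassert>

enum Req { kNullOp, kWriteTo, kWriteInplace, kAddTo };

// Simplified stand-in for MXNet's KERNEL_ASSIGN (assumed semantics).
template <Req req>
void kernel_assign(float *dst, float val) {
  if (req == kNullOp) return;           // no output requested
  else if (req == kAddTo) *dst += val;  // accumulate into existing output
  else *dst = val;                      // kWriteTo / kWriteInplace overwrite
}

int main() {
  float x = 1.0f;
  kernel_assign<kAddTo>(&x, 2.0f);
  assert(x == 3.0f);
  kernel_assign<kWriteTo>(&x, 5.0f);
  assert(x == 5.0f);
  return 0;
}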
@@ -74,33 +145,32 @@ class SequenceLastOp : public Operator {
     CHECK_EQ(out_data.size(), 1U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
 
+    // only support axis of 0 or 1 for now
+    auto axis = param_.axis;
+
     // Get any size input + output into required form
-    index_t n = in_data[seq_last::kData].size(1);
-    int max_seq_len = in_data[seq_last::kData].size(0);
-    int total_size = in_data[seq_last::kData].Size();
-    Shape<2> s2 = Shape2(n, static_cast<int>(total_size / n / max_seq_len));
-    Shape<3> s3 =
-        Shape3(max_seq_len, n, static_cast<int>(total_size / n / max_seq_len));
+    auto d0 = in_data[seq_last::kData].size(0);
+    auto d1 = in_data[seq_last::kData].size(1);
+    auto dsize = in_data[seq_last::kData].Size();
+
+    auto batch = (axis != 0) ? d0 : d1;
+    auto max_seq_len = in_data[seq_last::kData].size(axis);
+    auto rest_size = dsize / (d0 * d1);
+
     Tensor<xpu, 3, DType> data =
-        in_data[seq_last::kData].get_with_shape<xpu, 3, DType>(s3, s);
+        in_data[seq_last::kData].get_with_shape<xpu, 3, DType>(
+            Shape3(d0, d1, rest_size), s);
     Tensor<xpu, 2, DType> out =
-        out_data[seq_last::kOut].get_with_shape<xpu, 2, DType>(s2, s);
-
-    if (param_.use_sequence_length) {
-      std::vector<index_t> indices_vec(n, max_seq_len);
-      IndexTensorToVector(
-          in_data[seq_last::kSequenceLength].get<xpu, 1, DType>(s),
-          &indices_vec);
-      if (req[seq_last::kOut] == kWriteTo) out = 0.0f;
-      index_t seq_ind;
-      for (index_t i = 0; i < n; ++i) {
-        seq_ind = indices_vec[i] - 1;  // 1-indexing
-        out[i] += data[seq_ind][i];
-      }
-    } else {
-      Assign(out, req[seq_last::kOut],
-             F<mshadow_op::identity>(data[max_seq_len - 1]));
-    }
+        out_data[seq_last::kOut].get_with_shape<xpu, 2, DType>(
+            Shape2(batch, rest_size), s);
+    Tensor<xpu, 1, DType> indices =
+        param_.use_sequence_length
+            ? in_data[seq_last::kSequenceLength].get<xpu, 1, DType>(s)
+            : ctx.requested[seq_last::kTempSpace]
+                  .get_space_typed<xpu, 1, DType>(Shape1(batch), s);
+    if (!param_.use_sequence_length) indices = max_seq_len;
+
+    sequence_last(data, out, indices, req[seq_last::kOut], s);
   }
 
   virtual void Backward(const OpContext &ctx,
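Note: on the temp-space path, indices = max_seq_len; broadcast-fills the scratch tensor so every batch entry selects the final time step. As a plain-loop reference for what the launched kernel computes (an illustrative sketch for the axis == 0 layout, not MXNet code):

#include <vector>

// CPU reference for SequenceLast forward with axis == 0:
// data is (max_seq_len, batch, rest) flattened; lengths are 1-indexed.
std::vector<float> sequence_last_ref(const std::vector<float> &data,
                                     const std::vector<float> &lengths,
                                     int batch, int rest) {
  std::vector<float> out(batch * rest);
  for (int b = 0; b < batch; ++b) {
    const int t = static_cast<int>(lengths[b]) - 1;  // last valid step
    for (int r = 0; r < rest; ++r)
      out[b * rest + r] = data[(t * batch + b) * rest + r];
  }
  return out;
}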
@@ -119,33 +189,32 @@ class SequenceLastOp : public Operator {
     if (req[seq_last::kData] == kNullOp) return;
 
     Stream<xpu> *s = ctx.get_stream<xpu>();
+    // only support axis of 0 or 1 for now
+    auto axis = param_.axis;
 
     // Get any size input + output into required form
-    index_t n = in_grad[seq_last::kData].size(1);
-    int max_seq_len = in_grad[seq_last::kData].size(0);
-    int total_size = in_grad[seq_last::kData].Size();
-    Shape<2> s2 = Shape2(n, static_cast<int>(total_size / n / max_seq_len));
-    Shape<3> s3 =
-        Shape3(max_seq_len, n, static_cast<int>(total_size / n / max_seq_len));
+    auto d0 = in_data[seq_last::kData].size(0);
+    auto d1 = in_data[seq_last::kData].size(1);
+    auto dsize = in_data[seq_last::kData].Size();
+
+    auto batch = (axis != 0) ? d0 : d1;
+    auto max_seq_len = in_data[seq_last::kData].size(axis);
+    auto rest_size = dsize / (d0 * d1);
 
     Tensor<xpu, 3, DType> data_grad =
-        in_grad[seq_last::kData].get_with_shape<xpu, 3, DType>(s3, s);
+        in_grad[seq_last::kData].get_with_shape<xpu, 3, DType>(
+            Shape3(d0, d1, rest_size), s);
     Tensor<xpu, 2, DType> output_grad =
-        out_grad[seq_last::kOut].get_with_shape<xpu, 2, DType>(s2, s);
+        out_grad[seq_last::kOut].get_with_shape<xpu, 2, DType>(
+            Shape2(batch, rest_size), s);
+    Tensor<xpu, 1, DType> indices =
+        param_.use_sequence_length
+            ? in_data[seq_last::kSequenceLength].get<xpu, 1, DType>(s)
+            : ctx.requested[seq_last::kTempSpace]
+                  .get_space_typed<xpu, 1, DType>(Shape1(batch), s);
 
-    // copy indices to vector
-    std::vector<index_t> indices_vec(n, max_seq_len);
-    if (param_.use_sequence_length)
-      IndexTensorToVector(
-          in_data[seq_last::kSequenceLength].get<xpu, 1, DType>(s),
-          &indices_vec);
-
-    index_t seq_ind;
     if (req[seq_last::kData] == kWriteTo) data_grad = 0.0f;
-    for (index_t i = 0; i < n; ++i) {
-      seq_ind = indices_vec[i] - 1;
-      data_grad[seq_ind][i] += output_grad[i];
-    }
+    sequence_last_grad(data_grad, output_grad, indices, s);
   }
 
  private:
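Note: the backward now scatters each output gradient into the single time step it came from; kWriteTo zeroes data_grad first (kept above), while kAddTo works because SequenceLastGradKernel accumulates with +=. A plain-loop reference for the scatter (illustrative sketch, axis == 0, not MXNet code):

#include <vector>

// CPU reference for SequenceLast backward with axis == 0: scatter
// out_grad (batch, rest) into in_grad (max_seq_len, batch, rest).
void sequence_last_grad_ref(std::vector<float> *in_grad,  // zeroed beforehand for kWriteTo
                            const std::vector<float> &out_grad,
                            const std::vector<float> &lengths,
                            int batch, int rest) {
  for (int b = 0; b < batch; ++b) {
    const int t = static_cast<int>(lengths[b]) - 1;  // lengths are 1-indexed
    for (int r = 0; r < rest; ++r)
      (*in_grad)[(t * batch + b) * rest + r] += out_grad[b * rest + r];
  }
}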
@@ -183,18 +252,21 @@ class SequenceLastProp : public OperatorProperty {
     using namespace mshadow;
     CHECK_EQ(in_shape->size(), param_.use_sequence_length ? 2U : 1U)
         << "Input:[data, sequence_length]";
+    CHECK((param_.axis == 0) || (param_.axis == 1))
+        << "Current implementation expects axis to be 0 or 1.";
 
     const TShape &dshape = (*in_shape)[seq_last::kData];
     CHECK_GT(dshape.ndim(), 1U)
         << "The data array must be of rank 2 or greater.";
     // seq length vector is same as batch size
+    int sbatch = param_.axis ? dshape[0] : dshape[1];
     if (param_.use_sequence_length)
-      SHAPE_ASSIGN_CHECK(*in_shape, seq_last::kSequenceLength,
-                         Shape1(dshape[1]));
+      SHAPE_ASSIGN_CHECK(*in_shape, seq_last::kSequenceLength, Shape1(sbatch));
 
     // calculate output size
     TShape shape_o(dshape.ndim() - 1);
-    for (index_t i = 0; i < shape_o.ndim(); ++i) shape_o[i] = dshape[i + 1];
+    shape_o[0] = sbatch;
+    for (index_t i = 1; i < shape_o.ndim(); ++i) shape_o[i] = dshape[i + 1];
 
     const TShape &oshape = shape_o;
     out_shape->clear();
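Note: the inferred output shape drops the sequence axis and leads with the batch axis, so for either supported axis the result is (batch, rest...). A small self-contained sketch of the rule (illustrative; infer_out_shape is a hypothetical helper mirroring the code above):

#include <cassert>
#include <cstddef>
#include <vector>

// Batch axis first, then everything after the first two input axes.
std::vector<int> infer_out_shape(const std::vector<int> &dshape, int axis) {
  const int sbatch = axis ? dshape[0] : dshape[1];
  std::vector<int> shape_o(dshape.size() - 1);
  shape_o[0] = sbatch;
  for (std::size_t i = 1; i < shape_o.size(); ++i) shape_o[i] = dshape[i + 1];
  return shape_o;
}

int main() {
  // axis == 0: input (seq=10, batch=32, 128) -> output (32, 128)
  assert((infer_out_shape({10, 32, 128}, 0) == std::vector<int>{32, 128}));
  // axis == 1: input (batch=32, seq=10, 128) -> output (32, 128)
  assert((infer_out_shape({32, 10, 128}, 1) == std::vector<int>{32, 128}));
  return 0;
}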
Expand Down Expand Up @@ -227,6 +299,16 @@ class SequenceLastProp : public OperatorProperty {

std::string TypeString() const override { return "SequenceLast"; }

std::vector<ResourceRequest> ForwardResource(
const std::vector<TShape> &in_shape) const override {
return {ResourceRequest::kTempSpace};
}

std::vector<ResourceRequest> BackwardResource(
const std::vector<TShape> &in_shape) const override {
return {ResourceRequest::kTempSpace};
}

std::vector<int> DeclareBackwardDependency(
const std::vector<int> &out_grad, const std::vector<int> &in_data,
const std::vector<int> &out_data) const override {
Expand Down