From 9c7ef9e3cb54ea2f8676062d81050348ffe915df Mon Sep 17 00:00:00 2001
From: rongzha1
Date: Wed, 25 Dec 2019 16:12:08 +0800
Subject: [PATCH 1/4] add mkldnn softmax backward

---
 src/operator/nn/mkldnn/mkldnn_ops-inl.h  |  4 +++
 src/operator/nn/mkldnn/mkldnn_softmax.cc | 39 ++++++++++++++++++++++++
 src/operator/nn/softmax.cc               | 25 +++++++++++++--
 3 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
index 1ce36303689d..5917f59ef04a 100644
--- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -93,6 +93,10 @@ void MKLDNNLeakyReluBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
 void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                           const NDArray &in_data, const OpReqType &req,
                           const NDArray &out_data);
+void MKLDNNSoftmaxBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                           const std::vector<NDArray> &in_data,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<NDArray> &out_data);
 
 /* For softmax_output */
 void MKLDNNSoftmaxOutputForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc
index 1235f3c121fc..59a8dd31443f 100644
--- a/src/operator/nn/mkldnn/mkldnn_softmax.cc
+++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc
@@ -42,6 +42,18 @@ static mkldnn::softmax_forward::primitive_desc GetSoftmaxFwdPd(bool is_train,
   return mkldnn::softmax_forward::primitive_desc(desc, cpu_engine);
 }
 
+static mkldnn::softmax_backward::primitive_desc GetSoftmaxBwdPd(
+    const mkldnn::memory &diff_mem,
+    const mkldnn::memory &data_mem,
+    const int axis,
+    const mkldnn::softmax_forward::primitive_desc &hint_fwd_pd) {
+  mkldnn::memory::desc diff_md = diff_mem.get_desc();
+  mkldnn::memory::desc data_md = data_mem.get_desc();
+  auto cpu_engine = CpuEngine::Get()->get_engine();
+  auto desc = mkldnn::softmax_backward::desc(diff_md, data_md, axis);
+  return mkldnn::softmax_backward::primitive_desc(desc, cpu_engine, hint_fwd_pd);
+}
+
 bool SupportMKLDNNSoftmax(const SoftmaxParam &param,
                           const NDArray &data,
@@ -131,6 +143,33 @@ void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs,
   stream->Submit();
 }
 
+void MKLDNNSoftmaxBackward(const nnvm::NodeAttrs& attrs,
+                           const OpContext &ctx,
+                           const std::vector<NDArray> &in_data,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<NDArray> &out_data) {
+  if (req[0] == kNullOp) return;
+  CHECK_EQ(in_data.size(), 2U);
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  int axis = CheckAxis(param.axis, in_data[1].shape().ndim());
+  auto diff_mem = in_data[0].GetMKLDNNData();
+  auto data_mem = in_data[1].GetMKLDNNData();
+  auto fwd_pd = GetSoftmaxFwdPd(ctx.is_train, axis, *data_mem);
+  auto bwd_pd = GetSoftmaxBwdPd(*diff_mem, *data_mem, axis, fwd_pd);
+
+  auto out_mem = CreateMKLDNNMem(out_data[0], bwd_pd.diff_src_desc(), req[0]);
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  mkldnn_args_map_t args = {
+    { MKLDNN_ARG_DST, *data_mem },
+    { MKLDNN_ARG_DIFF_DST, *diff_mem },
+    { MKLDNN_ARG_DIFF_SRC, *out_mem.second },
+  };
+
+  stream->RegisterPrimArgs(bwd_pd, args);
+  CommitOutput(out_data[0], out_mem);
+  stream->Submit();
+}
+
 }  // namespace op
 }  // namespace mxnet
 #endif
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index 57edab7037d5..4afd6e45e371 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -54,13 +54,30 @@ static void SoftmaxComputeExCPU(const nnvm::NodeAttrs& attrs,
                   inputs, req, outputs);
 }
 
+static void SoftmaxGradComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                    const OpContext& ctx,
+                                    const std::vector<NDArray>& inputs,
+                                    const std::vector<OpReqType>& req,
+                                    const std::vector<NDArray>& outputs) {
+  // It seems MKLDNN softmax doesn't support training.
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  if (SupportMKLDNNSoftmax(param, inputs[1], outputs[0])) {
+    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    MKLDNNRun(MKLDNNSoftmaxBackward, attrs, ctx, inputs, req, outputs);
+    auto fn = SoftmaxGradCompute<cpu, op::mshadow_op::mul, mxnet_op::softmax_bwd>;
+    MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  FallBackCompute(SoftmaxGradCompute<cpu, op::mshadow_op::mul, mxnet_op::softmax_bwd>, attrs, ctx,
+                  inputs, req, outputs);
+}
+
 inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs,
                                       const int dev_mask,
                                       DispatchMode* dispatch_mode,
                                       std::vector<int> *in_attrs,
                                       std::vector<int> *out_attrs) {
   const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
-  CHECK_EQ(in_attrs->size(), (param.use_length.value()) ? 2U : 1U);
   CHECK_EQ(out_attrs->size(), 1U);
 
   if (param.use_length.value()) {
@@ -147,8 +164,12 @@ NNVM_REGISTER_OP(_backward_softmax)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption", SoftmaxGradOpInplaceOption)
 .add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments")
 .set_attr_parser(ParamParser<SoftmaxParam>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
+.set_attr<FComputeEx>("FComputeEx", SoftmaxGradComputeExCPU)
+.set_attr<FInferStorageType>("FInferStorageType", SoftmaxStorageType)
+#endif
 .set_attr<FCompute>("FCompute", SoftmaxGradCompute<cpu, op::mshadow_op::mul, mxnet_op::softmax_bwd>);
-
 } // namespace op
 } // namespace mxnet
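Note: the backward primitive registered above reads the forward output y (MKLDNN_ARG_DST) and the incoming gradient dy (MKLDNN_ARG_DIFF_DST) and writes dx_i = y_i * (dy_i - sum_j dy_j * y_j) along the softmax axis, the same gradient the SoftmaxGradCompute fallback produces. Below is a minimal standalone sketch of that formula over the innermost axis of a row-major buffer (plain C++, not part of the patch; the helper name softmax_bwd_ref is illustrative). It can serve as an independent reference when MKLDNN_OPCHECK_RUN flags a mismatch.

#include <cstddef>
#include <vector>

// Reference softmax backward over the innermost axis of a [rows x cols]
// row-major buffer: dx = y * (dy - dot(dy, y)), computed row by row.
std::vector<float> softmax_bwd_ref(const std::vector<float> &y,
                                   const std::vector<float> &dy,
                                   std::size_t rows, std::size_t cols) {
  std::vector<float> dx(rows * cols);
  for (std::size_t r = 0; r < rows; ++r) {
    const float *yr = &y[r * cols];
    const float *dyr = &dy[r * cols];
    float acc = 0.f;  // dot(dy, y) along the axis
    for (std::size_t c = 0; c < cols; ++c) acc += dyr[c] * yr[c];
    for (std::size_t c = 0; c < cols; ++c)
      dx[r * cols + c] = yr[c] * (dyr[c] - acc);
  }
  return dx;
}
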
From 98efa1eea926b853ebe7cbc26e8e85ed3d6a641c Mon Sep 17 00:00:00 2001
From: rongzha1
Date: Thu, 2 Jan 2020 14:41:05 +0800
Subject: [PATCH 2/4] add primitive cache for softmax bwd

---
 src/operator/nn/mkldnn/mkldnn_ops-inl.h  |  6 +--
 src/operator/nn/mkldnn/mkldnn_softmax.cc | 65 ++++++++++++++++++++----
 src/operator/nn/softmax.cc               | 23 +++++++--
 3 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
index 5917f59ef04a..c862607372a9 100644
--- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -94,9 +94,9 @@ void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                           const NDArray &in_data, const OpReqType &req,
                           const NDArray &out_data);
 void MKLDNNSoftmaxBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                           const std::vector<NDArray> &in_data,
-                           const std::vector<OpReqType> &req,
-                           const std::vector<NDArray> &out_data);
+                          const std::vector<NDArray> &in_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<NDArray> &out_data);
 
 /* For softmax_output */
 void MKLDNNSoftmaxOutputForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc
index 59a8dd31443f..44429e323cf2 100644
--- a/src/operator/nn/mkldnn/mkldnn_softmax.cc
+++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc
@@ -143,29 +143,76 @@ void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs,
   stream->Submit();
 }
 
+class MKLDNNSoftmaxBwd {
+ public:
+  mkldnn::softmax_backward::primitive_desc pd;
+
+  MKLDNNSoftmaxBwd(const mkldnn::memory &diff_mem,
+                   const mkldnn::memory &data_mem,
+                   const int axis,
+                   const mkldnn::softmax_forward::primitive_desc &hint_fwd_pd) :
+                   pd(GetSoftmaxBwdPd(diff_mem, data_mem, axis, hint_fwd_pd)) {
+    bwd_ = std::make_shared<mkldnn::softmax_backward>(pd);
+  }
+
+  const mkldnn::softmax_backward &GetBwd() const {
+    return *bwd_;
+  }
+
+ private:
+  std::shared_ptr<mkldnn::softmax_backward> bwd_;
+};
+
+typedef ParamOpSign<SoftmaxParam> MKLDNNSoftmaxSignature;
+
+static MKLDNNSoftmaxBwd &GetSoftmaxBwd(const SoftmaxParam &param,
+                                       const int real_axis,
+                                       const std::vector<NDArray> &data,
+                                       const std::vector<NDArray> &output) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<MKLDNNSoftmaxSignature, MKLDNNSoftmaxBwd, OpHash> bwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<MKLDNNSoftmaxSignature, MKLDNNSoftmaxBwd, OpHash> bwds;
+#endif
+
+  MKLDNNSoftmaxSignature key(param);
+  key.AddSign(real_axis);
+  key.AddSign(data);
+  key.AddSign(output);
+
+  auto it = bwds.find(key);
+  if (it == bwds.end()) {
+    auto diff_mem = data[0].GetMKLDNNData();
+    auto data_mem = data[1].GetMKLDNNData();
+    auto fwd_pd = GetSoftmaxFwdPd(true, real_axis, *data_mem);
+    MKLDNNSoftmaxBwd bwd(*diff_mem, *data_mem, real_axis, fwd_pd);
+    it = AddToCache(&bwds, key, bwd);
+  }
+  return it->second;
+}
+
 void MKLDNNSoftmaxBackward(const nnvm::NodeAttrs& attrs,
-                           const OpContext &ctx,
-                           const std::vector<NDArray> &in_data,
-                           const std::vector<OpReqType>& req,
-                           const std::vector<NDArray> &out_data) {
+                          const OpContext &ctx,
+                          const std::vector<NDArray> &in_data,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray> &out_data) {
   if (req[0] == kNullOp) return;
   CHECK_EQ(in_data.size(), 2U);
   const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
   int axis = CheckAxis(param.axis, in_data[1].shape().ndim());
   auto diff_mem = in_data[0].GetMKLDNNData();
   auto data_mem = in_data[1].GetMKLDNNData();
-  auto fwd_pd = GetSoftmaxFwdPd(ctx.is_train, axis, *data_mem);
-  auto bwd_pd = GetSoftmaxBwdPd(*diff_mem, *data_mem, axis, fwd_pd);
+  auto bwd = GetSoftmaxBwd(param, axis, in_data, out_data);
 
-  auto out_mem = CreateMKLDNNMem(out_data[0], bwd_pd.diff_src_desc(), req[0]);
+  auto out_mem = CreateMKLDNNMem(out_data[0], bwd.pd.diff_src_desc(), req[0]);
   MKLDNNStream *stream = MKLDNNStream::Get();
   mkldnn_args_map_t args = {
     { MKLDNN_ARG_DST, *data_mem },
     { MKLDNN_ARG_DIFF_DST, *diff_mem },
-    { MKLDNN_ARG_DIFF_SRC, *out_mem.second },
+    { MKLDNN_ARG_DIFF_SRC, *out_mem.second }
   };
 
-  stream->RegisterPrimArgs(bwd_pd, args);
+  stream->RegisterPrimArgs(bwd.GetBwd(), args);
   CommitOutput(out_data[0], out_mem);
   stream->Submit();
 }
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index 4afd6e45e371..37c91e0e48fa 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -41,7 +41,6 @@ static void SoftmaxComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const std::vector<NDArray>& inputs,
                                 const std::vector<OpReqType>& req,
                                 const std::vector<NDArray>& outputs) {
-  // It seems MKLDNN softmax doesn't support training.
   const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
   if (SupportMKLDNNSoftmax(param, inputs[0], outputs[0])) {
     MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
@@ -59,7 +58,6 @@ static void SoftmaxGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                     const std::vector<NDArray>& inputs,
                                     const std::vector<OpReqType>& req,
                                     const std::vector<NDArray>& outputs) {
-  // It seems MKLDNN softmax doesn't support training.
   const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
   if (SupportMKLDNNSoftmax(param, inputs[1], outputs[0])) {
     MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
@@ -78,6 +76,25 @@ inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs,
                                       const int dev_mask,
                                       DispatchMode* dispatch_mode,
                                       std::vector<int> *in_attrs,
                                       std::vector<int> *out_attrs) {
   const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  CHECK_EQ(in_attrs->size(), (param.use_length.value()) ? 2U : 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  if (param.use_length.value()) {
+    auto& out_stype = out_attrs->at(0);
+    return storage_type_assign(&out_stype, kDefaultStorage,
+                               dispatch_mode, DispatchMode::kFCompute);
+  }
+
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
+}
+
+inline static bool SoftmaxGradStorageType(const nnvm::NodeAttrs& attrs,
+                                          const int dev_mask,
+                                          DispatchMode* dispatch_mode,
+                                          std::vector<int> *in_attrs,
+                                          std::vector<int> *out_attrs) {
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
   CHECK_EQ(out_attrs->size(), 1U);
 
   if (param.use_length.value()) {
@@ -167,7 +184,7 @@ NNVM_REGISTER_OP(_backward_softmax)
 #if MXNET_USE_MKLDNN == 1
 .set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx", SoftmaxGradComputeExCPU)
-.set_attr<FInferStorageType>("FInferStorageType", SoftmaxStorageType)
+.set_attr<FInferStorageType>("FInferStorageType", SoftmaxGradStorageType)
 #endif
 .set_attr<FCompute>("FCompute", SoftmaxGradCompute<cpu, op::mshadow_op::mul, mxnet_op::softmax_bwd>);
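Note: the cache added here follows the primitive-cache idiom used by the other MKLDNN operators. A thread-local hash map is keyed by an op signature (ParamOpSign, extended with the axis and the input/output arrays via AddSign), so the primitive_desc and primitive are constructed once per distinct configuration rather than on every backward call; keeping the map thread-local avoids locking on the hot path at the cost of one cache per worker thread. A stripped-down sketch of the pattern (CachedPrimitive and the plain int key are placeholders standing in for MKLDNNSoftmaxBwd and MKLDNNSoftmaxSignature; the real AddToCache helper also performs cache bookkeeping the sketch omits):

#include <unordered_map>

// A cached "primitive": in the patch this is MKLDNNSoftmaxBwd, here a stub.
struct CachedPrimitive {
  explicit CachedPrimitive(int axis) : axis(axis) { /* expensive setup */ }
  int axis;
};

// Thread-local cache keyed by axis only; the real key also folds in
// shapes and dtypes, so a new shape creates a new cache entry.
CachedPrimitive &GetCachedPrimitive(int axis) {
  static thread_local std::unordered_map<int, CachedPrimitive> cache;
  auto it = cache.find(axis);
  if (it == cache.end())
    it = cache.emplace(axis, CachedPrimitive(axis)).first;  // build once
  return it->second;  // reuse on every later call with the same key
}
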
From 7a76d7e510039e247a63eeb2fc498501989ada32 Mon Sep 17 00:00:00 2001
From: rongzha1
Date: Tue, 7 Jan 2020 13:48:38 +0800
Subject: [PATCH 3/4] fix preci failed test

---
 src/operator/nn/softmax.cc | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index 37c91e0e48fa..97949ffbc81e 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -95,14 +95,13 @@ inline static bool SoftmaxGradStorageType(const nnvm::NodeAttrs& attrs,
                                           std::vector<int> *in_attrs,
                                           std::vector<int> *out_attrs) {
   const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
-  CHECK_EQ(out_attrs->size(), 1U);
-
-  if (param.use_length.value()) {
+  if (param.use_length.value() || softmax_has_dtype_override(attrs)) {
     auto& out_stype = out_attrs->at(0);
     return storage_type_assign(&out_stype, kDefaultStorage,
                                dispatch_mode, DispatchMode::kFCompute);
   }
-
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
   return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
                            out_attrs);
 }
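Note: the failing CI tests exercised configurations the MKLDNN path cannot express: softmax with use_length, and softmax whose dtype attribute overrides the input dtype. SoftmaxGradStorageType now routes both straight to the default FCompute kernel and applies the two-input/one-output checks only on the remaining MKLDNN branch, where they always hold. Distilled to its decision rule (a sketch; the real storage_type_assign and MKLDNNStorageType plumbing is elided):

// Distilled dispatch rule implemented by SoftmaxGradStorageType after this
// fix: anything the MKLDNN path cannot handle (length masking via use_length,
// or an explicit dtype override on the op) is routed to the plain FCompute
// kernel; only the remaining cases take the MKLDNN storage-type path.
enum class Dispatch { kFCompute, kMKLDNN };

Dispatch PickSoftmaxGradDispatch(bool use_length, bool dtype_override) {
  if (use_length || dtype_override)
    return Dispatch::kFCompute;  // fall back; unsupported by the MKLDNN path
  return Dispatch::kMKLDNN;      // eligible for MKLDNN compute
}
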
From 05302b06224ba0c6d91b4957d752c529deb88a3e Mon Sep 17 00:00:00 2001
From: rongzha1
Date: Wed, 15 Jan 2020 15:01:02 +0800
Subject: [PATCH 4/4] rm duplicate line

---
 src/operator/nn/mkldnn/mkldnn_softmax.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc
index 44429e323cf2..e96ab6c20ca3 100644
--- a/src/operator/nn/mkldnn/mkldnn_softmax.cc
+++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc
@@ -163,8 +163,6 @@ class MKLDNNSoftmaxBwd {
   std::shared_ptr<mkldnn::softmax_backward> bwd_;
 };
 
-typedef ParamOpSign<SoftmaxParam> MKLDNNSoftmaxSignature;
-
 static MKLDNNSoftmaxBwd &GetSoftmaxBwd(const SoftmaxParam &param,
                                        const int real_axis,
                                        const std::vector<NDArray> &data,
                                        const std::vector<NDArray> &output) {
@@ -194,7 +192,7 @@ static MKLDNNSoftmaxBwd &GetSoftmaxBwd(const SoftmaxParam &param,
 void MKLDNNSoftmaxBackward(const nnvm::NodeAttrs& attrs,
                           const OpContext &ctx,
                           const std::vector<NDArray> &in_data,
-                          const std::vector<OpReqType>& req,
+                          const std::vector<OpReqType> &req,
                           const std::vector<NDArray> &out_data) {
   if (req[0] == kNullOp) return;
   CHECK_EQ(in_data.size(), 2U);
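Note: a cheap end-to-end sanity check for the backward path follows from the forward output summing to 1 along the axis: the softmax gradient then sums to 0 along that axis, since sum_i y_i * (dy_i - dot(dy, y)) = dot(dy, y) - dot(dy, y). A self-contained check of that identity (plain C++, illustrative only, not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> x  = {1.f, 2.f, 3.f, 4.f};
  const std::vector<float> dy = {0.5f, -1.f, 2.f, 0.25f};

  // forward: y = softmax(x), computed with the usual max-subtraction trick
  float mx = x[0];
  for (float v : x) mx = std::max(mx, v);
  std::vector<float> y(x.size());
  float z = 0.f;
  for (std::size_t i = 0; i < x.size(); ++i) z += (y[i] = std::exp(x[i] - mx));
  for (float &v : y) v /= z;

  // backward: dx = y * (dy - dot(dy, y)); its sum should vanish
  float acc = 0.f;
  for (std::size_t i = 0; i < x.size(); ++i) acc += dy[i] * y[i];
  float grad_sum = 0.f;
  for (std::size_t i = 0; i < x.size(); ++i) grad_sum += y[i] * (dy[i] - acc);

  std::printf("sum(dx) = %g (expected ~0)\n", grad_sum);
  return std::fabs(grad_sum) < 1e-6f ? 0 : 1;
}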