From f69b4a055447a5be30385d69acfa1403961e00eb Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Fri, 18 Sep 2020 20:18:50 -0700 Subject: [PATCH 01/30] [1.x][FEATURE] CUDA graphs support (#19142) * Initial cherry-pick * Store NodeAttrs in OpExecutor * Do not allow stateful operations in CUDA graphs and provide mechanism for marking ops as safe * Guard against using ops with synchronization * Cleaning * Properly guard graphs * Limit graphs to CUDA 10.2+ * Fix the compilation when graphs are not available * Guarding the libcuda.so usage behind RTC compilation flag * Document the env variables * Add test * Fix the test * Use with_environment --- docs/static_site/src/pages/api/faq/env_var.md | 10 + include/mxnet/op_attr_types.h | 13 + src/imperative/attach_op_execs_pass.cc | 57 +- src/imperative/cuda_graphs.h | 547 ++++++++++++++++++ src/imperative/exec_pass.h | 18 + src/imperative/imperative_utils.h | 21 +- src/operator/contrib/adamw.cu | 17 + src/operator/nn/dropout.cu | 8 +- src/operator/numpy/linalg/np_eig.cu | 14 +- src/operator/numpy/linalg/np_eigvals.cu | 14 +- src/operator/numpy/linalg/np_norm_backward.cu | 5 + src/operator/numpy/linalg/np_norm_forward.cu | 8 +- src/operator/numpy/np_boolean_mask_assign.cu | 8 + src/operator/numpy/np_constraint_check.cu | 4 + src/operator/numpy/np_percentile_op.cu | 7 +- src/operator/numpy/random/np_bernoulli_op.cu | 7 +- .../numpy/random/np_exponential_op.cu | 8 + src/operator/numpy/random/np_gamma_op.cu | 7 +- .../numpy/random/np_multinomial_op.cu | 4 + src/operator/numpy/random/np_normal_op.cu | 14 +- src/operator/numpy/random/np_pareto_op.cu | 7 +- src/operator/numpy/random/np_power_op.cu | 7 +- src/operator/numpy/random/np_rayleigh_op.cu | 7 +- src/operator/numpy/random/np_weibull_op.cu | 7 +- src/operator/tensor/indexing_op.cu | 7 +- tests/python/gpu/test_gluon_gpu.py | 110 +++- 26 files changed, 892 insertions(+), 44 deletions(-) create mode 100644 src/imperative/cuda_graphs.h diff --git 
a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md index dad481cfbf3f..1a4421d2e50f 100644 --- a/docs/static_site/src/pages/api/faq/env_var.md +++ b/docs/static_site/src/pages/api/faq/env_var.md @@ -170,6 +170,16 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 * MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD - Values: Int ```(default=)``` - The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass. +* MXNET_ENABLE_CUDA_GRAPHS + - Values: 0(false) or 1(true) ```(default=0)``` + - If set to `1`, MXNet will utilize CUDA graphs when executing models on the GPU when possible. + - For CUDA graphs execution, one needs to use either symbolic model or Gluon model hybridized with options `static_alloc` and `static_shape` set to True. +* MXNET_CUDA_GRAPHS_VERBOSE + - Values: 0(false) or 1(true) ```(default=0)``` + - If set to `1`, CUDA graphs executor will provide information about the graph being captured and executed. +* MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES + - Values: Int ```(default=0)``` + - The maximum number of log messages generated by CUDA graphs executor. ## Control the Data Communication diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index 2fec1768ea86..73504bb2748b 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -357,6 +357,19 @@ using FNeedCalibrateInput = std::function(const NodeAttrs& attr */ using FNeedCalibrateOutput = std::function(const NodeAttrs& attrs)>; +#if MXNET_USE_CUDA + +/*! + * \brief Register a function to determine if + * the operator implementation is compatible + * with CUDA graphs. This requires the execution + * to stay the same as long as the shape and type + * of input stays the same. 
+ */ +using FIsCUDAGraphsCompatible = std::function; + +#endif + } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/src/imperative/attach_op_execs_pass.cc b/src/imperative/attach_op_execs_pass.cc index 4a8c51d107c7..719c9b3165fe 100644 --- a/src/imperative/attach_op_execs_pass.cc +++ b/src/imperative/attach_op_execs_pass.cc @@ -47,8 +47,10 @@ namespace exec { // FComputeExecutor and FStatefulComputeExecutor inherit from this class class StorageFallbackOpExecutor : public OpExecutor { public: - explicit StorageFallbackOpExecutor(std::vector mutate_idx) - : mutate_idx_(std::move(mutate_idx)) {} + explicit StorageFallbackOpExecutor(const NodeAttrs& attrs, + DispatchMode dispatch_mode, + std::vector mutate_idx) + : OpExecutor(attrs, dispatch_mode), mutate_idx_(std::move(mutate_idx)) {} void Setup() override { init_ = false; @@ -146,11 +148,13 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { return state_; } - explicit StatefulComputeExecutor(OpStatePtr state, + explicit StatefulComputeExecutor(const NodeAttrs& attrs, + DispatchMode dispatch_mode, + OpStatePtr state, FStatefulCompute fcompute, ExecType exec_type, const std::vector& mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), state_(std::move(state)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} @@ -168,7 +172,7 @@ class StatefulComputeExExecutor : public OpExecutor { op_ctx.run_ctx = rctx; INVALIDATE_OUTPUTS(out_array, req); std::vector* pInArray = &in_array; - CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs_); + CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs); fcompute_(state_, op_ctx, *pInArray, req, out_array); } @@ -186,17 +190,17 @@ class StatefulComputeExExecutor : public OpExecutor { return state_; } - explicit StatefulComputeExExecutor(NodeAttrs attrs, + explicit StatefulComputeExExecutor(const NodeAttrs& attrs, + DispatchMode 
dispatch_mode, OpStatePtr state, FStatefulComputeEx fcompute, ExecType exec_type) - : attrs_(std::move(attrs)), + : OpExecutor(attrs, dispatch_mode), state_(std::move(state)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} private: - NodeAttrs attrs_; OpStatePtr state_; FStatefulComputeEx fcompute_; ExecType exec_type_; @@ -210,7 +214,7 @@ class FComputeExecutor : public StorageFallbackOpExecutor { op_ctx.run_ctx = rctx; INVALIDATE_OUTPUTS(out_array, req); PreFCompute(is_gpu); - fcompute_(attrs_, op_ctx, in_data_, req, out_data_); + fcompute_(attrs, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); } @@ -218,17 +222,16 @@ class FComputeExecutor : public StorageFallbackOpExecutor { return exec_type_; } - explicit FComputeExecutor(NodeAttrs attrs, + explicit FComputeExecutor(const NodeAttrs& attrs, + DispatchMode dispatch_mode, FCompute fcompute, ExecType exec_type, const std::vector& mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), - attrs_(std::move(attrs)), + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} private: - NodeAttrs attrs_; FCompute fcompute_; ExecType exec_type_; }; @@ -240,8 +243,8 @@ class FComputeExExecutor : public OpExecutor { op_ctx.run_ctx = rctx; INVALIDATE_OUTPUTS(out_array, req); std::vector* pInArray = &in_array; - CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs_); - fcompute_(attrs_, op_ctx, *pInArray, req, out_array); + CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs); + fcompute_(attrs, op_ctx, *pInArray, req, out_array); } void Setup() override {} @@ -250,11 +253,12 @@ class FComputeExExecutor : public OpExecutor { return exec_type_; } - explicit FComputeExExecutor(NodeAttrs attrs, FComputeEx fcompute, ExecType exec_type) + explicit FComputeExExecutor(const NodeAttrs& attrs, DispatchMode dispatch_mode, + FComputeEx fcompute, ExecType exec_type) + : OpExecutor(attrs, dispatch_mode), 
fcompute_(std::move(fcompute)), exec_type_(exec_type) {} : attrs_(std::move(attrs)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} private: - NodeAttrs attrs_; FComputeEx fcompute_; ExecType exec_type_; }; @@ -309,14 +313,15 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared( - inode.source->attrs, state, fcompute_ex, exec_type); + inode.source->attrs, dispatch_modes[i], state, fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute(op, "FStatefulCompute", vctx[i]); CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared(state, fcompute, exec_type, mutate_index); + ret[i] = std::make_shared( + inode.source->attrs, dispatch_modes[i], state, fcompute, exec_type, mutate_index); } } else if (is_layer_backward.get(op, false)) { CHECK_GE(inode.control_deps.size(), 1); @@ -327,25 +332,27 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, common::GetFCompute(op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared( - inode.source->attrs, ret[fwd_id].get()->state(), fcompute_ex, exec_type); + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], ret[fwd_id].get()->state(), fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute(op, "FStatefulCompute", vctx[i]); CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared( - ret[fwd_id].get()->state(), 
fcompute, exec_type, mutate_index); + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], ret[fwd_id].get()->state(), fcompute, exec_type, + mutate_index); } } else { FCompute fcompute = common::GetFCompute(op, "FCompute", vctx[i]); FComputeEx fcomp_ex = common::GetFCompute(op, "FComputeEx", vctx[i]); if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(inode.source->attrs, fcomp_ex, exec_type); + ret[i] = std::make_shared( + inode.source->attrs, dispatch_modes[i], fcomp_ex, exec_type); } else if (fcompute != nullptr) { ret[i] = std::make_shared( - inode.source->attrs, fcompute, exec_type, mutate_index); + inode.source->attrs, dispatch_modes[i], fcompute, exec_type, mutate_index); } else { LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name; } diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h new file mode 100644 index 000000000000..e669d7d1d2e3 --- /dev/null +++ b/src/imperative/cuda_graphs.h @@ -0,0 +1,547 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2020 by Contributors + * \file cuda_graphs.h + * \brief Wrappers for use of CUDA Graphs API + */ +#ifndef MXNET_EXECUTOR_CUDA_GRAPHS_H_ +#define MXNET_EXECUTOR_CUDA_GRAPHS_H_ + +#include +#include +#include +#include +#include + +#include "./exec_pass.h" +#include "../common/cuda_utils.h" + +#if MXNET_USE_CUDA +#define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10020) +#else +#define CUDA_GRAPHS_AVAILABLE (0) +#endif + +#if CUDA_GRAPHS_AVAILABLE + +namespace mxnet { +namespace cuda_graphs { + +inline std::string CudaDim3ToString(const dim3& dims) { + std::stringstream ss; + if (dims.z != 1) + ss << "(" << dims.x << "," << dims.y << "," << dims.z << ")"; + else if (dims.y != 1) + ss << "(" << dims.x << "," << dims.y << ")"; + else + ss << "(" << dims.x << ")"; + return ss.str(); +} + +// Return the list of CUDA Graph nodes from a graph +inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { + size_t numNodes; + CUDA_CALL(cudaGraphGetNodes(cuda_graph, static_cast(nullptr), &numNodes)); + if (numNodes == 0) + return std::vector(); + std::vector graphNodes(numNodes); + CUDA_CALL(cudaGraphGetNodes(cuda_graph, graphNodes.data(), &numNodes)); + return graphNodes; +} + +// It does not really involve RTC, but requires libcuda.so, +// which is linked only when RTC is enabled. +#if MXNET_ENABLE_CUDA_RTC + +inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { + std::stringstream ss; + + // The following introspection calls are made through the driver API in order to bypass + // problems that would arise if multiple statically-linked copies of the runtime exist. 
+ + CUgraphNode cu_node = node; + CUgraphNodeType t; + CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); + switch (t) { + case CU_GRAPH_NODE_TYPE_KERNEL: + { + CUDA_KERNEL_NODE_PARAMS kparams; + auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); + if (err == CUDA_SUCCESS) { + ss << "GPUKernel@" << kparams.func; + dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); + dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); + ss << "<<>>"; + ss << "(..."; + if (kparams.sharedMemBytes != 0) + ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; + ss << ")"; + } else { + ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; + } + } + break; + case CU_GRAPH_NODE_TYPE_MEMCPY: + { + cudaMemcpy3DParms mparams = {}; + CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); + // If memcpy is seen, return without setting up runnable executor + switch (mparams.kind) { + case cudaMemcpyHostToHost: ss << "Host->Host "; break; + case cudaMemcpyHostToDevice: ss << "Host->Device "; break; + case cudaMemcpyDeviceToHost: ss << "Device->Host "; break; + case cudaMemcpyDeviceToDevice: ss << "Device->Device "; break; + default: break; + } + ss << "Memcpy"; + } + break; + case CU_GRAPH_NODE_TYPE_MEMSET: + { + cudaMemsetParams mparams = {}; + CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); + if (mparams.height == 1 && mparams.elementSize == 1) { + ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value + << ", count=" << mparams.width << ")"; + } else { + if (mparams.elementSize == 1) + ss << "cudaMemset2D"; + else + ss << "MemSet"; + ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch + << ", value=" << mparams.value << ", width=" << mparams.width + << ", height=" << mparams.height << ")"; + } + } + break; + case CU_GRAPH_NODE_TYPE_HOST: ss << "Host (executable) node"; break; + case CU_GRAPH_NODE_TYPE_GRAPH: ss << "Node which executes an embedded graph"; break; + case CU_GRAPH_NODE_TYPE_EMPTY: 
ss << "Empty (no-op) node"; break; + default: ss << "Unknown/Invalid node type " << t; + } + return ss.str(); +} + +#endif // MXNET_ENABLE_CUDA_RTC + +// CUDA Graphs are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphDeleter { + public: + void operator() (cudaGraph_t graph) { + if (graph != nullptr) + CUDA_CALL(cudaGraphDestroy(graph)); + } +}; + +// CUDA Graphs Executors are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphExecDeleter { + public: + void operator() (cudaGraphExec_t graph_exec) { + if (graph_exec != nullptr) + CUDA_CALL(cudaGraphExecDestroy(graph_exec)); + } +}; + +// A CUDA Graphs executor for a portion of an Operator Segment (i.e. a 'SubSegment'), +// characterized by a starting index in the OpExecutor list and a number of ops. +class CudaGraphsSubSegExec { + public: + CudaGraphsSubSegExec(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops, + bool ops_are_cuda_graph_compatible = true) : + from_op_idx_(from_op_idx), + num_ops_(num_ops), + graph_(nullptr), + graph_exec_(nullptr) { + if (ops_are_cuda_graph_compatible) { + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); + MakeGraphExec(); + } + } + + void Update(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose) { + // Current executor should be Runnable with the same parameters + CHECK(IsRunnable()); + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx_, num_ops_); + + cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; + cudaGraphNode_t error_node; + CUDA_CALL(cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), + &error_node, &update_result)); + // If update fails make a new executor, discarding old one. 
+ if (update_result != cudaGraphExecUpdateSuccess) + MakeGraphExec(); + } + + void RunSubSeg(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + if (IsRunnable()) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + CUDA_CALL(cudaGraphLaunch(graph_exec_.get(), cu_s)); + } else { + // No CUDA Graph could be made for this portion of the OpSegment. Run conventionally. + for (int i = 0; i != num_ops_; ++i) + exec_list[from_op_idx_ + i]->Run(rctx, is_gpu); + } + } + + bool IsRunnable() { return graph_exec_ != nullptr; } + + private: + void MakeGraph(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // Create CUDA Graph + // Use of cudaStreamCaptureModeThreadLocal allows other threads like GPU Copy workers + // to sync their streams without disturbing this capture. + CUDA_CALL(cudaStreamBeginCapture(cu_s, cudaStreamCaptureModeThreadLocal)); + // Run those oprs in the sub segment while capturing- no actual GPU work is launched. 
+ for (int i = 0; i != num_ops; ++i) + exec_list[from_op_idx + i]->Run(rctx, is_gpu); + cudaGraph_t cuda_graph = nullptr; + CUDA_CALL(cudaStreamEndCapture(cu_s, &cuda_graph)); + graph_.reset(cuda_graph, CudaGraphDeleter()); + + if (verbose) { + std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); + size_t num_nodes = graph_nodes.size(); + LOG(INFO) << " Graph has " << num_nodes << " nodes:"; +#if MXNET_ENABLE_CUDA_RTC + for (size_t i = 0; i != num_nodes; ++i) { + LOG(INFO) << " node " << i << " = " + << CudaGraphNodeToString(graph_nodes[i]); + } +#endif // MXNET_ENABLE_CUDA_RTC + } + } + + void MakeGraphExec() { + cudaGraphExec_t cuda_graph_exec; + cudaGraphNode_t error_node; + char log_buffer[1000]; + + CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), + &error_node, log_buffer, 1000)); + graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); + + // At this point we have a CUDA Graph executor + static int num_graph_creations_logged = 0; + static int max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); + if (num_graph_creations_logged < max_log_entries) { + num_graph_creations_logged++; + LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; + if (num_graph_creations_logged == max_log_entries) + LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; + } + } + + int from_op_idx_; + int num_ops_; + using cudaGraphStruct_t = typename std::remove_pointer::type; + using cudaGraphExecStruct_t = typename std::remove_pointer::type; + std::shared_ptr graph_; + std::shared_ptr graph_exec_; +}; + +// The CudaGraph executor and associated Tempspace ptrs for which it is valid. +struct CudaGraphInfo { + std::vector cuda_graph_subseg_execs; + bool has_been_run_conventionally = false; + std::vector tempspace_dptrs; +}; +// A CUDA graph is maintained for every combination of cudaStream_t (i.e. GPU Worker) and +// the state of the is_train flag of the OpContext. 
If the tempspace_dptrs change, we +// don't expect to ever see the old tempspace_dptrs config again, so we discard the CUDA graph. +struct CudaGraphCacheKey { + cudaStream_t cu_s; + bool is_train; + // overload '<' so CudaGraphCacheKey can be used as a std::map key + bool operator<(const CudaGraphCacheKey &other) const { + return cu_s < other.cu_s || (cu_s == other.cu_s && is_train < other.is_train); + } +}; +using CudaGraphCache = std::map; + +class CudaGraphsExec { + public: + CudaGraphsExec(const std::vector > &exec_list, + bool is_gpu, + const char *opr_names) : + verbose_(false), is_enabled_(false) { + opr_names_ = opr_names ? std::string(opr_names) : std::string(); + if (is_gpu) { + is_enabled_ = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false); + verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); + SetTempSpaces(exec_list); + } + } + + void RunAll(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + // If this a CPU op or CUDA Graphs use isn't possible, run normally and return + if (!is_gpu || !is_enabled_) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + return; + } + + // Also if we're in a warm-up period where tempspace pointers are likely + // to change, run normally and return + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // All the ops in the bulked segment will have the same setting of is_train as the first op + const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; + const CudaGraphCacheKey key = {cu_s, is_train}; + // Look-up the CUDA Graph info for this combo of stream and is_train setting + // This may create a default-initialized new entry. 
+ auto &cuda_graph_info = cache_[key]; + if (!cuda_graph_info.has_been_run_conventionally) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + cuda_graph_info.has_been_run_conventionally = true; + return; + } + + // At this point we will launch one or more CUDA Graphs through CUDA Graphs 'executors' + // (there might be more than one executor if some ops in the segment are not capturable) + auto before_exec_tempspace_ptrs = GetGPUTempspacePtrs(s); + + // Executors exist, but the tempspace pts have changed, so update them in-place via 'recapture'. + if (cuda_graph_info.cuda_graph_subseg_execs.size() > 0 && + cuda_graph_info.tempspace_dptrs != before_exec_tempspace_ptrs) { + // Update all runnable executors. Non-runnable executors launch their ops conventionally. + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + subseg_exec.Update(exec_list, rctx, is_gpu, verbose_); + } + } else if (cuda_graph_info.cuda_graph_subseg_execs.size() == 0) { + // No executors exist yet, so create them. + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op segment " << opr_names_; + // Make one or more CUDA Graphs, avoiding ops that are not compatible. + for (size_t first_op_idx = 0; first_op_idx != exec_list.size();) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != exec_list.size(); ++last_op_idx) { + if (OpOK(exec_list[last_op_idx])) + num_good_ops++; + else + break; + } + if (num_good_ops > 0) { + CreateSubExecOverRegion(exec_list, rctx, is_gpu, + first_op_idx, + first_op_idx + num_good_ops, + &cuda_graph_info.cuda_graph_subseg_execs); + first_op_idx += num_good_ops; + } + if (first_op_idx != exec_list.size()) { + // We had to have hit an op that was not OK. 
+ if (verbose_) { + LOG(INFO) << "Bypassing notOK op segment[" << first_op_idx << "," << first_op_idx << "]" + << " of op segment " << opr_names_; + } + CudaGraphsSubSegExec notOK_opseg(exec_list, rctx, is_gpu, false, first_op_idx, 1, false); + cuda_graph_info.cuda_graph_subseg_execs.push_back(notOK_opseg); + first_op_idx++; + } + } + // During graph capture, the ops may be asking for the tempworkspace. This should + // not alter the base pointers, since this op seg has been executed before on this + // stream (i.e. on this gpu worker). Safest to double-check this though. + auto after_capture_tempspace_ptrs = GetGPUTempspacePtrs(s); + if (before_exec_tempspace_ptrs != after_capture_tempspace_ptrs) + LOG(FATAL) << "Internal error: saw change in TempSpace ptrs during CUDA graph use."; + cuda_graph_info.tempspace_dptrs = before_exec_tempspace_ptrs; + } + // Now execute the CUDA Graph that we either just created or looked-up in the cache. + if (verbose_) { + int runnable_execs = 0; + int bypassed_ops = 0; + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + runnable_execs++; + else + bypassed_ops++; + } + LOG(INFO) << "Launching " << runnable_execs + << " captured CUDA graph(s) for op segment " << opr_names_; + if (bypassed_ops > 0) + LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; + } + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) + subseg_exec.RunSubSeg(exec_list, rctx, is_gpu); + } + + private: + // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). If such a graph + // is not runnable, e.g. if it includes memcpys from unpinned cpu memory, then make a + // number of smaller graphs that avoid those ops with the memcpys. 
+ void CreateSubExecOverRegion(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + size_t from_op_idx, + size_t upto_op_idx, + std::vector *cuda_graph_subseg_execs) { + // Optimistically try to create a CUDA Graph of the entire op segment region + + int num_ops = upto_op_idx - from_op_idx; + CudaGraphsSubSegExec full_opseg(exec_list, rctx, is_gpu, verbose_, from_op_idx, num_ops); + if (full_opseg.IsRunnable()) { + cuda_graph_subseg_execs->push_back(full_opseg); + } else { + if (verbose_) + LOG(INFO) << " Graph was not runnable- creating op sub-segments..."; + // Enter fall-back approach to making many sub-execs + for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx; ) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != upto_op_idx; ++last_op_idx) { + CudaGraphsSubSegExec single_opseg(exec_list, rctx, is_gpu, false, last_op_idx, 1); + if (single_opseg.IsRunnable()) + num_good_ops++; + // Is it time to create a subseg exec from accumulated good ops? 
+ if (num_good_ops > 0 && + (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op sub segment[" + << first_op_idx << ":" << (first_op_idx + num_good_ops - 1) << "]" + << " of op segment " << opr_names_; + CudaGraphsSubSegExec good_opseg(exec_list, rctx, is_gpu, verbose_, + first_op_idx, num_good_ops); + CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; + cuda_graph_subseg_execs->push_back(good_opseg); + first_op_idx += num_good_ops; + } + // If the last single op was not runnable, use the exec to handle that op conventionally + if (!single_opseg.IsRunnable()) { + if (verbose_) { + LOG(INFO) << "Bypassing op sub segment[" << last_op_idx << "," << last_op_idx << "]" + << " of op segment " << opr_names_; + // Generate throw-away exec in order to produce a diagnostic listing of graph nodes + CudaGraphsSubSegExec dummy(exec_list, rctx, is_gpu, verbose_, last_op_idx, 1); + } + cuda_graph_subseg_execs->push_back(single_opseg); + first_op_idx++; + break; + } + } + } + } + } + + // Is the Op OK to make part of a CUDA Graph? 
+ bool OpOK(const std::shared_ptr &exec) { + static auto& fstateful = Op::GetAttr("FCreateOpState"); + static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); + const auto& attrs = exec->attrs; + if (attrs.op != nullptr) { + const auto f = fgraphcompatible.get(attrs.op, nullptr); + if (f != nullptr) { + return f(attrs, exec->op_ctx.is_train); + } + if (fstateful.get(attrs.op, nullptr) != nullptr) { + if (verbose_) { + LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; + } + return false; + } + if ((fcompute_ex.get(attrs.op, nullptr) != nullptr && + exec->dispatch_mode == DispatchMode::kFComputeEx) || + exec->dispatch_mode == DispatchMode::kFComputeFallback) { + if (verbose_) { + LOG(INFO) << "Omitting operator " << attrs.op->name + << " from CUDA graph due to dispatch mode " + << static_cast(exec->dispatch_mode); + } + return false; + } + } + for (auto& resource : exec->op_ctx.requested) { + if (!(resource.req.type == ResourceRequest::kTempSpace)) { + if (verbose_) { + LOG(INFO) << "Omitting operator " << attrs.op->name + << " from CUDA graph due to using the resource type " + << static_cast(resource.req.type); + } + return false; + } + } + return true; + } + + // Determine Tempspaces used by ops. Other resource uses disable CUDA Graphs. + void SetTempSpaces(const std::vector > &exec_list) { + // Gather info about the ops use of TempSpace. 
+ if (is_enabled_) { + std::set tempspaces_set; + for (auto& exec : exec_list) { + for (auto& resource : exec->op_ctx.requested) { + if (resource.req.type == ResourceRequest::kTempSpace) { + tempspaces_set.insert(&resource); + } + } + } + tempspaces_.assign(tempspaces_set.begin(), tempspaces_set.end()); + } + } + + // Return the addresses of the gpu TempSpace areas + std::vector GetGPUTempspacePtrs(mshadow::Stream *s) { + std::vector ret; + for (const auto& resource : tempspaces_) { + // Ask for minimal allocation to get base pointer without increasing the size + auto *base_ptr = resource->get_space_typed(mshadow::Shape1(1), s).dptr_; + ret.push_back(static_cast(base_ptr)); + } + return ret; + } + + CudaGraphCache cache_; + std::vector tempspaces_; + std::string opr_names_; + bool verbose_; + bool is_enabled_; +}; + +} // namespace cuda_graphs +} // namespace mxnet + +#endif // CUDA_GRAPHS_AVAILABLE + +#endif // MXNET_EXECUTOR_CUDA_GRAPHS_H_ diff --git a/src/imperative/exec_pass.h b/src/imperative/exec_pass.h index 7667d97632fc..6da6c8d95c69 100644 --- a/src/imperative/exec_pass.h +++ b/src/imperative/exec_pass.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,13 @@ class OpExecutor { std::vector req; /*! \brief runtime op context, contains allocated resources */ OpContext op_ctx; + /*! \brief attributes of the node */ + NodeAttrs attrs; + /*! \brief dispatch mode of the executor */ + DispatchMode dispatch_mode; + + explicit OpExecutor(NodeAttrs attrs, DispatchMode dispatch_mode) : + attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {} /*! \brief virtual destructor */ virtual ~OpExecutor() {} /*! @@ -98,6 +106,16 @@ class OpExecutor { * \param rctx The runtime context passed in by environment. */ virtual void Run(RunContext rctx, bool is_gpu) = 0; + /*! + * \brief run the operators of a vector of execs, given runtime context on device. + * This function call does not synchronize the stream. 
+ * \param rctx The runtime context passed in by environment. + */ + static void RunAll(const std::vector > &execs, + RunContext rctx, bool is_gpu) { + for (auto &exec : execs) + exec->Run(rctx, is_gpu); + } /*! \return the execution type */ virtual ExecType exec_type() const = 0; /*! \return return engine variable for operator states */ diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index ce1a60fb2b20..bd9fe783ab28 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -27,6 +27,8 @@ #include #include +#include "./exec_pass.h" +#include "./cuda_graphs.h" #include "../c_api/c_api_common.h" #include "../common/exec_utils.h" #include "../common/utils.h" @@ -1248,6 +1250,21 @@ inline Engine::OprHandle CreateEngineOp( bool is_gpu = default_ctx.dev_mask() == gpu::kDevMask; bool is_async = execs.size() > 1 ? false : execs[0]->exec_type() == ExecType::kAsync; +#if CUDA_GRAPHS_AVAILABLE + // Provide initialized `cuda_graphs_exec`, which when captured + // by exec_fun, acts like a static variable inside the mutable closure. 
+ cuda_graphs::CudaGraphsExec cuda_graphs_exec(execs, is_gpu, opr_names); + auto exec_fun = [cuda_graphs_exec, execs, is_async, is_gpu] ( + RunContext ctx, + Engine::CallbackOnStart on_start, + Engine::CallbackOnComplete on_complete) mutable { + on_start(); + if (is_async) { + execs[0]->op_ctx.async_on_complete = on_complete; + } + // Run all opr in the sub-graph with CUDA graphs executor if possible + cuda_graphs_exec.RunAll(execs, ctx, is_gpu); +#else auto exec_fun = [execs, is_async, is_gpu](RunContext ctx, Engine::CallbackOnStart on_start, Engine::CallbackOnComplete on_complete) { @@ -1255,8 +1272,8 @@ inline Engine::OprHandle CreateEngineOp( if (is_async) { execs[0]->op_ctx.async_on_complete = on_complete; } - for (const auto& exec : execs) - exec->Run(ctx, is_gpu); + exec::OpExecutor::RunAll(execs, ctx, is_gpu); +#endif // call on complete only if it is async op if (!is_async) { if (is_gpu) { diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu index 802378839bc2..33fca6ccea67 100644 --- a/src/operator/contrib/adamw.cu +++ b/src/operator/contrib/adamw.cu @@ -45,17 +45,34 @@ void GetScaleFloat(mshadow::Stream* s, const TBlob& scale_blob, float* })} NNVM_REGISTER_OP(_adamw_update) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", adamw::MPUpdate>); NNVM_REGISTER_OP(_mp_adamw_update) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", adamw::MPUpdate>); NNVM_REGISTER_OP(_multi_adamw_update) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", adamw::multiMPUpdate); NNVM_REGISTER_OP(_multi_mp_adamw_update) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", adamw::multiMPUpdate); + } // namespace adamw } // namespace op } // namespace mxnet diff --git 
a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index d6c97f5f09fc..bff9b020126c 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -28,7 +28,13 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(Dropout).set_attr("FStatefulCompute", DropoutCompute); +NNVM_REGISTER_OP(Dropout) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool is_train) { + // Dropout is just passthrough during inference + return !is_train; + }) + .set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) .set_attr("FStatefulCompute", DropoutGradCompute); diff --git a/src/operator/numpy/linalg/np_eig.cu b/src/operator/numpy/linalg/np_eig.cu index 1f89106bab47..ab5c6644501d 100644 --- a/src/operator/numpy/linalg/np_eig.cu +++ b/src/operator/numpy/linalg/np_eig.cu @@ -28,11 +28,21 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_eig).set_attr("FCompute", EigOpForward); +NNVM_REGISTER_OP(_npi_eig) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", EigOpForward); #if MXNET_USE_CUSOLVER == 1 -NNVM_REGISTER_OP(_npi_eigh).set_attr("FCompute", EighOpForward); +NNVM_REGISTER_OP(_npi_eigh) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", EighOpForward); #endif diff --git a/src/operator/numpy/linalg/np_eigvals.cu b/src/operator/numpy/linalg/np_eigvals.cu index dc03805c54d0..94a007d7a245 100644 --- a/src/operator/numpy/linalg/np_eigvals.cu +++ b/src/operator/numpy/linalg/np_eigvals.cu @@ -28,11 +28,21 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_eigvals).set_attr("FCompute", EigvalsOpForward); +NNVM_REGISTER_OP(_npi_eigvals) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", EigvalsOpForward); #if MXNET_USE_CUSOLVER == 1 -NNVM_REGISTER_OP(_npi_eigvalsh).set_attr("FCompute", EigvalshOpForward); 
+NNVM_REGISTER_OP(_npi_eigvalsh) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", EigvalshOpForward); #endif diff --git a/src/operator/numpy/linalg/np_norm_backward.cu b/src/operator/numpy/linalg/np_norm_backward.cu index 24d8783dba33..e88d717bd580 100644 --- a/src/operator/numpy/linalg/np_norm_backward.cu +++ b/src/operator/numpy/linalg/np_norm_backward.cu @@ -26,6 +26,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_backward_npi_norm) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeBackward); } // namespace op diff --git a/src/operator/numpy/linalg/np_norm_forward.cu b/src/operator/numpy/linalg/np_norm_forward.cu index 89267632d898..26a87c580010 100644 --- a/src/operator/numpy/linalg/np_norm_forward.cu +++ b/src/operator/numpy/linalg/np_norm_forward.cu @@ -25,7 +25,13 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_norm).set_attr("FCompute", NumpyNormComputeForward); +NNVM_REGISTER_OP(_npi_norm) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) + .set_attr("FCompute", NumpyNormComputeForward); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_boolean_mask_assign.cu b/src/operator/numpy/np_boolean_mask_assign.cu index 10f8612a3ffb..8e1bd57386cd 100644 --- a/src/operator/numpy/np_boolean_mask_assign.cu +++ b/src/operator/numpy/np_boolean_mask_assign.cu @@ -273,9 +273,17 @@ void NumpyBooleanAssignForwardGPU(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_npi_boolean_mask_assign_scalar) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); 
NNVM_REGISTER_OP(_npi_boolean_mask_assign_tensor) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); } // namespace op diff --git a/src/operator/numpy/np_constraint_check.cu b/src/operator/numpy/np_constraint_check.cu index 04a0a36f4043..8d622d152bac 100644 --- a/src/operator/numpy/np_constraint_check.cu +++ b/src/operator/numpy/np_constraint_check.cu @@ -38,6 +38,10 @@ void GetReduceOutput(mshadow::Stream* s, const TBlob& output_blob, boo } NNVM_REGISTER_OP(_npx_constraint_check) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", ConstraintCheckForward); } // namespace op diff --git a/src/operator/numpy/np_percentile_op.cu b/src/operator/numpy/np_percentile_op.cu index 13d076dd9b53..7ff67ad3c9c5 100644 --- a/src/operator/numpy/np_percentile_op.cu +++ b/src/operator/numpy/np_percentile_op.cu @@ -52,7 +52,12 @@ bool CheckInvalidInput(mshadow::Stream* s, return is_valid == 0; } -NNVM_REGISTER_OP(_npi_percentile).set_attr("FCompute", NumpyPercentileForward); +NNVM_REGISTER_OP(_npi_percentile) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyPercentileForward); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/random/np_bernoulli_op.cu b/src/operator/numpy/random/np_bernoulli_op.cu index 8cdceb5bb4c8..0cf9bd95ab7c 100644 --- a/src/operator/numpy/random/np_bernoulli_op.cu +++ b/src/operator/numpy/random/np_bernoulli_op.cu @@ -27,7 +27,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_bernoulli).set_attr("FCompute", NumpyBernoulliForward); +NNVM_REGISTER_OP(_npi_bernoulli) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyBernoulliForward); } // namespace op } // namespace mxnet diff --git 
a/src/operator/numpy/random/np_exponential_op.cu b/src/operator/numpy/random/np_exponential_op.cu index 60809fbb91c5..7a2068dad8a1 100644 --- a/src/operator/numpy/random/np_exponential_op.cu +++ b/src/operator/numpy/random/np_exponential_op.cu @@ -28,7 +28,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_exponential) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyExponentialForward); NNVM_REGISTER_OP(_backward_broadcast_exponential) .set_attr("FCompute", ExponentialReparamBackward); diff --git a/src/operator/numpy/random/np_gamma_op.cu b/src/operator/numpy/random/np_gamma_op.cu index 7e3cabc3a83f..8bfc61aad7ab 100644 --- a/src/operator/numpy/random/np_gamma_op.cu +++ b/src/operator/numpy/random/np_gamma_op.cu @@ -28,7 +28,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_gamma).set_attr("FCompute", NumpyGammaForward); +NNVM_REGISTER_OP(_npi_gamma) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyGammaForward); NNVM_REGISTER_OP(_backward_gamma_sample).set_attr("FCompute", NumpyGammaGrad); diff --git a/src/operator/numpy/random/np_multinomial_op.cu b/src/operator/numpy/random/np_multinomial_op.cu index 083b410a2d8a..ee77b79c6d91 100644 --- a/src/operator/numpy/random/np_multinomial_op.cu +++ b/src/operator/numpy/random/np_multinomial_op.cu @@ -41,6 +41,10 @@ void CheckPvalGPU(const OpContext& ctx, DType* input, int prob_length) { } NNVM_REGISTER_OP(_npi_multinomial) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyMultinomialForward); } // namespace op diff --git a/src/operator/numpy/random/np_normal_op.cu b/src/operator/numpy/random/np_normal_op.cu index db8746165c6e..3d310f82b20d
100644 --- a/src/operator/numpy/random/np_normal_op.cu +++ b/src/operator/numpy/random/np_normal_op.cu @@ -27,12 +27,22 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_normal).set_attr("FCompute", NumpyNormalForward); +NNVM_REGISTER_OP(_npi_normal) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyNormalForward); NNVM_REGISTER_OP(_backward_broadcast_normal) .set_attr("FCompute", NormalReparamBackward); -NNVM_REGISTER_OP(_npi_normal_n).set_attr("FCompute", NumpyNormalForward); +NNVM_REGISTER_OP(_npi_normal_n) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyNormalForward); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/random/np_pareto_op.cu b/src/operator/numpy/random/np_pareto_op.cu index 7618d2871099..59900a0090e8 100644 --- a/src/operator/numpy/random/np_pareto_op.cu +++ b/src/operator/numpy/random/np_pareto_op.cu @@ -27,7 +27,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_pareto).set_attr("FCompute", NumpyParetoForward); +NNVM_REGISTER_OP(_npi_pareto) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyParetoForward); NNVM_REGISTER_OP(_backward_broadcast_pareto) .set_attr("FCompute", ParetoReparamBackward); diff --git a/src/operator/numpy/random/np_power_op.cu b/src/operator/numpy/random/np_power_op.cu index 290442037eee..f5bcb1000771 100644 --- a/src/operator/numpy/random/np_power_op.cu +++ b/src/operator/numpy/random/np_power_op.cu @@ -27,7 +27,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_powerd).set_attr("FCompute", NumpyPowerForward); +NNVM_REGISTER_OP(_npi_powerd) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyPowerForward); } // namespace op } // namespace mxnet diff --git 
a/src/operator/numpy/random/np_rayleigh_op.cu b/src/operator/numpy/random/np_rayleigh_op.cu index 586f17481e30..c17d6c2bdd79 100644 --- a/src/operator/numpy/random/np_rayleigh_op.cu +++ b/src/operator/numpy/random/np_rayleigh_op.cu @@ -27,7 +27,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_rayleigh).set_attr("FCompute", NumpyRayleighForward); +NNVM_REGISTER_OP(_npi_rayleigh) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyRayleighForward); NNVM_REGISTER_OP(_backward_broadcast_rayleigh) .set_attr("FCompute", RayleighReparamBackward); diff --git a/src/operator/numpy/random/np_weibull_op.cu b/src/operator/numpy/random/np_weibull_op.cu index 658be16e6333..62c0a564c060 100644 --- a/src/operator/numpy/random/np_weibull_op.cu +++ b/src/operator/numpy/random/np_weibull_op.cu @@ -27,7 +27,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_weibull).set_attr("FCompute", NumpyWeibullForward); +NNVM_REGISTER_OP(_npi_weibull) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyWeibullForward); NNVM_REGISTER_OP(_backward_broadcast_weibull) .set_attr("FCompute", WeibullReparamBackward); diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 90504301cc22..f4ce8ebcfd8a 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -957,7 +957,12 @@ NNVM_REGISTER_OP(batch_take).set_attr("FCompute", BatchTakeOpForw NNVM_REGISTER_OP(one_hot).set_attr("FCompute", OneHotOpForward); -NNVM_REGISTER_OP(gather_nd).set_attr("FCompute", GatherNDForwardGPU); +NNVM_REGISTER_OP(gather_nd) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", GatherNDForwardGPU); NNVM_REGISTER_OP(scatter_nd).set_attr("FCompute", ScatterNDForward); diff --git a/tests/python/gpu/test_gluon_gpu.py 
b/tests/python/gpu/test_gluon_gpu.py index 492055f4aef2..3e54c5ed647a 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -18,6 +18,7 @@ import sys import os import time +import random import mxnet as mx import multiprocessing as mp from mxnet.test_utils import check_consistency, set_default_device, assert_almost_equal, rand_ndarray, environment @@ -28,7 +29,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) -from common import assert_raises_cudnn_not_satisfied, run_in_spawned_process +from common import assert_raises_cudnn_not_satisfied, run_in_spawned_process, random_seed from test_gluon import * from test_loss import * from test_numpy_loss import * @@ -595,3 +596,110 @@ def test_cudnn_dropout_reproducibility(): assert_almost_equal(a.grad, b.grad) +def test_cuda_graphs(): + class GraphTester(gluon.HybridBlock): + def __init__(self, function_to_test, **kwargs): + super(GraphTester, self).__init__(**kwargs) + with self.name_scope(): + self.f = function_to_test() + + def hybrid_forward(self, F, *args): + # We need to isolate the operation to be fully inside the graph + # in order for graphs usage to be possible + copied_args = [F.identity(a) for a in args] + outputs = self.f(*copied_args) + if isinstance(outputs, (list, tuple)): + return [F.identity(o) for o in outputs] + else: + return F.identity(outputs) + + class TestDesc: + def __init__(self, name, f, num_inputs=1, input_dim=4): + self.name = name + self.f = f + self.num_inputs = num_inputs + self.input_dim = input_dim + + def generate_inputs(self): + shape = tuple(np.random.randint(4, 11, size=self.input_dim)) + ret = [mx.random.uniform(shape=shape) for _ in range(self.num_inputs)] + for r in ret: + r.attach_grad() + return ret + + tested_ops = [ + TestDesc('add', lambda: (lambda x, y: x + y), num_inputs = 2), + TestDesc('add_scalar', lambda: (lambda x: x + 0.5)), + TestDesc('Conv', 
lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(1,1))), + TestDesc('ConvTranspose', lambda: mx.gluon.nn.Conv2DTranspose(channels=32, kernel_size=(1,1))), + TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), + TestDesc('Activation', lambda: mx.gluon.nn.Activation('tanh')), + #TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), + TestDesc('Flatten', lambda: mx.gluon.nn.Flatten()), + TestDesc('MaxPool', lambda: mx.gluon.nn.MaxPool2D()), + TestDesc('AvgPool', lambda: mx.gluon.nn.AvgPool2D()), + TestDesc('GlobalMaxPool', lambda: mx.gluon.nn.GlobalMaxPool2D()), + TestDesc('GlobalAvgPool', lambda: mx.gluon.nn.GlobalAvgPool2D()), + TestDesc('ReflectionPad2D', lambda: mx.gluon.nn.ReflectionPad2D()), + TestDesc('BatchNorm', lambda: mx.gluon.nn.BatchNorm()), + TestDesc('InstanceNorm', lambda: mx.gluon.nn.InstanceNorm()), + TestDesc('LayerNorm', lambda: mx.gluon.nn.LayerNorm()), + TestDesc('LeakyReLU', lambda: mx.gluon.nn.LeakyReLU(0.1)), + TestDesc('PReLU', lambda: mx.gluon.nn.PReLU()), + TestDesc('ELU', lambda: mx.gluon.nn.ELU()), + TestDesc('SELU', lambda: mx.gluon.nn.SELU()), + TestDesc('Swish', lambda: mx.gluon.nn.Swish()), + ] + + N = 10 + + with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', + 'MXNET_USE_FUSION': '0'}): + for test_desc in tested_ops: + print("Testing ", test_desc.name) + inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) + + # initialize parameters + net.initialize() + netg.initialize() + + net(*inputs) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + p2.set_data(p1.data()) + + netg.hybridize(static_alloc=True, static_shape=True) + + print("Testing inference mode") + with random_seed(seed): + for _ in range(N): + assert_almost_equal(net(*inputs), netg(*inputsg)) + + mx.nd.waitall() + print("Testing training mode") + for _ in range(N): + with 
random_seed(seed): + with mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) + mx.nd.waitall() From 9ca54abd1eb874ee8ba5e0362269daadeed1d149 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 9 Jun 2021 18:31:50 -0700 Subject: [PATCH 02/30] Fix compile and test_cuda_graphs --- src/imperative/cuda_graphs.h | 2 +- tests/python/gpu/test_gluon_gpu.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index e669d7d1d2e3..2dbebdffab54 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -32,7 +32,7 @@ #include #include "./exec_pass.h" -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #if MXNET_USE_CUDA #define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10020) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 3e54c5ed647a..f58322d101bb 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -600,8 +600,7 @@ def test_cuda_graphs(): class GraphTester(gluon.HybridBlock): def __init__(self, function_to_test, **kwargs): super(GraphTester, self).__init__(**kwargs) - with self.name_scope(): - self.f = function_to_test() + self.f = function_to_test() def hybrid_forward(self, F, *args): # We need to isolate the operation to be fully inside the graph From 0b4ed478d159385f85d9a3a3fffe56015cf203fd Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 9 Jun 2021 20:19:26 -0700 Subject: [PATCH 03/30] Fix lint --- src/imperative/cuda_graphs.h | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 2dbebdffab54..e68d6a6a41d9 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -22,8 +22,8 @@ * \file cuda_graphs.h * \brief Wrappers for use of CUDA Graphs API */ -#ifndef MXNET_EXECUTOR_CUDA_GRAPHS_H_ -#define MXNET_EXECUTOR_CUDA_GRAPHS_H_ +#ifndef MXNET_IMPERATIVE_CUDA_GRAPHS_H_ +#define MXNET_IMPERATIVE_CUDA_GRAPHS_H_ #include #include @@ -544,4 +544,4 @@ class CudaGraphsExec { #endif // CUDA_GRAPHS_AVAILABLE -#endif // MXNET_EXECUTOR_CUDA_GRAPHS_H_ +#endif // MXNET_IMPERATIVE_CUDA_GRAPHS_H_ From cc694868a6678ed6157b80555e4e00ca3e187cf3 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 24 Jun 2021 18:56:53 -0700 Subject: [PATCH 04/30] Mark more ops as not CUDA Graphs compatible --- src/imperative/cuda_graphs.h | 9 +-------- src/operator/contrib/index_array.cu | 7 ++++++- src/operator/contrib/multi_lamb.cu | 8 ++++++++ src/operator/numpy/np_matrix_op.cu | 16 +++++++++++++--- src/operator/numpy/np_nonzero_op.cu | 4 ++++ src/operator/tensor/elemwise_unary_op_basic.cu | 7 ++++++- src/operator/tensor/matrix_op.cu | 16 +++++++++++++--- 7 files changed, 51 insertions(+), 16 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index e68d6a6a41d9..bab3a062d314 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -67,10 +67,7 @@ inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { return graphNodes; } -// It does not really involve RTC, but requires libcuda.so, -// which is linked only when RTC is enabled. 
-#if MXNET_ENABLE_CUDA_RTC - +// Create a description of a CUDA Graph node inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { std::stringstream ss; @@ -141,8 +138,6 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { return ss.str(); } -#endif // MXNET_ENABLE_CUDA_RTC - // CUDA Graphs are managed in RAII fashion by smart pointers below. // Function objects (preferred for readability) provide the deleter function. class CudaGraphDeleter { @@ -241,12 +236,10 @@ class CudaGraphsSubSegExec { std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); size_t num_nodes = graph_nodes.size(); LOG(INFO) << " Graph has " << num_nodes << " nodes:"; -#if MXNET_ENABLE_CUDA_RTC for (size_t i = 0; i != num_nodes; ++i) { LOG(INFO) << " node " << i << " = " << CudaGraphNodeToString(graph_nodes[i]); } -#endif // MXNET_ENABLE_CUDA_RTC } } diff --git a/src/operator/contrib/index_array.cu b/src/operator/contrib/index_array.cu index 482cbf6b8150..d317ccdf7059 100644 --- a/src/operator/contrib/index_array.cu +++ b/src/operator/contrib/index_array.cu @@ -82,7 +82,12 @@ void IndexArrayForwardGPU(const nnvm::NodeAttrs& attrs, } } -NNVM_REGISTER_OP(_contrib_index_array).set_attr("FCompute", IndexArrayForwardGPU); +NNVM_REGISTER_OP(_contrib_index_array) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) + .set_attr("FCompute", IndexArrayForwardGPU); } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/multi_lamb.cu b/src/operator/contrib/multi_lamb.cu index 118ec6348ed7..8361e04cabb3 100644 --- a/src/operator/contrib/multi_lamb.cu +++ b/src/operator/contrib/multi_lamb.cu @@ -268,9 +268,17 @@ void CallKernel2(Stream* s, } NNVM_REGISTER_OP(_multi_lamb_update) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) .set_attr("FCompute", MultiLAMBUpdate); NNVM_REGISTER_OP(_multi_mp_lamb_update) + .set_attr("FIsCUDAGraphsCompatible", + 
[](const NodeAttrs& attrs, const bool) { + return false; + }) .set_attr("FCompute", MultiLAMBUpdate); } // namespace op diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index f2078146c78e..23b149973f6c 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -92,9 +92,19 @@ void NumpyFlipForwardImpl(const OpContext& ctx, }); } -NNVM_REGISTER_OP(_npi_flip).set_attr("FCompute", NumpyFlipForward); - -NNVM_REGISTER_OP(_backward_npi_flip).set_attr("FCompute", NumpyFlipForward); +NNVM_REGISTER_OP(_npi_flip) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) + .set_attr("FCompute", NumpyFlipForward); + +NNVM_REGISTER_OP(_backward_npi_flip) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) + .set_attr("FCompute", NumpyFlipForward); NNVM_REGISTER_OP(_npi_moveaxis).set_attr("FCompute", NumpyMoveaxisCompute); diff --git a/src/operator/numpy/np_nonzero_op.cu b/src/operator/numpy/np_nonzero_op.cu index 1499030dbe9b..8ae39bdf2c4c 100644 --- a/src/operator/numpy/np_nonzero_op.cu +++ b/src/operator/numpy/np_nonzero_op.cu @@ -115,6 +115,10 @@ NNVM_REGISTER_OP(_npx_nonzero) [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; }) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) .set_attr("FComputeEx", NonzeroForwardGPU); } // namespace op diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index 7fdc047630cb..baa36c6be587 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -115,7 +115,12 @@ void ShapeComputeGPU(const nnvm::NodeAttrs& attrs, mshadow::Stream::GetStream(s)); } -NNVM_REGISTER_OP(shape_array).set_attr("FCompute", ShapeComputeGPU); +NNVM_REGISTER_OP(shape_array) + .set_attr("FIsCUDAGraphsCompatible", + 
[](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", ShapeComputeGPU); void SizeComputeGPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index b5bd1c96d25b..dc7d608dacb9 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -412,9 +412,19 @@ NNVM_REGISTER_OP(tile).set_attr("FCompute", TileOpForward); NNVM_REGISTER_OP(_backward_tile).set_attr("FCompute", TileOpBackward); -NNVM_REGISTER_OP(reverse).set_attr("FCompute", ReverseOpForward); - -NNVM_REGISTER_OP(_backward_reverse).set_attr("FCompute", ReverseOpForward); +NNVM_REGISTER_OP(reverse) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", ReverseOpForward); + +NNVM_REGISTER_OP(_backward_reverse) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", ReverseOpForward); NNVM_REGISTER_OP(stack).set_attr("FCompute", StackOpForward); From e79c111a1776635c92b6e900c37674bed531907a Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 27 May 2021 20:59:00 -0700 Subject: [PATCH 05/30] Mark some linalg ops as not CUDA Graphs compatible --- src/imperative/cuda_graphs.h | 18 ++++++++---- src/operator/numpy/np_matrix_op.cu | 29 +++++++++++++++++-- src/operator/tensor/la_op.cu | 46 ++++++++++++++++++++++++++++-- 3 files changed, 83 insertions(+), 10 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index bab3a062d314..97424cfac074 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -189,11 +189,19 @@ class CudaGraphsSubSegExec { cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; cudaGraphNode_t error_node; - CUDA_CALL(cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), - &error_node, &update_result)); - // If update fails make a new executor, discarding old one. 
- if (update_result != cudaGraphExecUpdateSuccess) - MakeGraphExec(); + cudaError_t e = cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), + &error_node, &update_result); + switch (e) { + case cudaErrorGraphExecUpdateFailure: + MakeGraphExec(); + break; + case cudaSuccess: + CHECK_EQ(update_result, cudaGraphExecUpdateSuccess); + break; + default: + // Respond normally to unusual cudaGraphExecUpdate() ret vals + CUDA_CALL(e); + } } void RunSubSeg(const std::vector > &exec_list, diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index 23b149973f6c..ed9ab81f2066 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -52,9 +52,19 @@ NNVM_REGISTER_OP(_npi_column_stack) NNVM_REGISTER_OP(_backward_np_column_stack) .set_attr("FCompute", NumpyColumnStackBackward); -NNVM_REGISTER_OP(_npi_tril_indices).set_attr("FCompute", TrilindicesOpForward); +NNVM_REGISTER_OP(_npi_tril_indices) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) + .set_attr("FCompute", TrilindicesOpForward); -NNVM_REGISTER_OP(_npi_roll).set_attr("FCompute", NumpyRollCompute); +NNVM_REGISTER_OP(_npi_roll) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) + .set_attr("FCompute", NumpyRollCompute); template <> void NumpyFlipForwardImpl(const OpContext& ctx, @@ -113,7 +123,20 @@ NNVM_REGISTER_OP(_npi_rollaxis).set_attr("FCompute", NumpyRollaxi NNVM_REGISTER_OP(_npi_rollaxis_backward) .set_attr("FCompute", NumpyRollaxisBackward); -NNVM_REGISTER_OP(_npi_rot90).set_attr("FCompute", NumpyRot90Compute); +NNVM_REGISTER_OP(_npi_rot90) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const auto& param = nnvm::get(attrs.parsed); + // Should track code in NumpyRot90Compute() + int real_k(param.k); + real_k = real_k % 4; + if (real_k < 0) { + real_k += 4; + } + // Avoid NumpyRot90ComputeFlipIml(), which uses 
mshadow::Copy() + return real_k != 2; + }) + .set_attr("FCompute", NumpyRot90Compute); NNVM_REGISTER_OP(_npi_hsplit).set_attr("FCompute", HSplitOpForward); diff --git a/src/operator/tensor/la_op.cu b/src/operator/tensor/la_op.cu index 1f16e2d58251..8cae4a4d5be8 100644 --- a/src/operator/tensor/la_op.cu +++ b/src/operator/tensor/la_op.cu @@ -88,6 +88,10 @@ NNVM_REGISTER_OP(_backward_linalg_maketrian) .set_attr("FCompute", LaOpBackward); NNVM_REGISTER_OP(_linalg_potri) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_potri) @@ -99,32 +103,70 @@ NNVM_REGISTER_OP(_linalg_inverse) NNVM_REGISTER_OP(_backward_linalg_inverse) .set_attr("FCompute", LaOpBackward); -NNVM_REGISTER_OP(_linalg_det).set_attr("FCompute", LaOpDetForward); +NNVM_REGISTER_OP(_linalg_det) + // Incompatibility comes from allocs made in linalg_batch_getrf(), called by det::op() + // see https://github.com/apache/incubator-mxnet/issues/19353 + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", LaOpDetForward); NNVM_REGISTER_OP(_backward_linalg_det) + // Incompatibility comes from allocs made in linalg_batch_getri(), + // called by linalg_batch_det_backward_helper, called by det_backward::op() + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpDetBackward); NNVM_REGISTER_OP(_linalg_slogdet) + // Incompatibility comes from allocs made in linalg_batch_getrf(), + // called by slogdet::op(). 
+ .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpDetForward); NNVM_REGISTER_OP(_backward_linalg_slogdet) + // Incompatibility comes from allocs made in linalg_batch_getri(), + // called by linalg_batch_det_backward_helper, called by slogdet_backward::op() + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpDetBackward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_linalg_potrf) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_potrf) .set_attr("FCompute", LaOpBackward); NNVM_REGISTER_OP(_linalg_gelqf) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_gelqf) .set_attr("FCompute", LaOpBackward); -NNVM_REGISTER_OP(_linalg_syevd).set_attr("FCompute", LaOpForwSyevd); +NNVM_REGISTER_OP(_linalg_syevd) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", LaOpForwSyevd); NNVM_REGISTER_OP(_backward_linalg_syevd) .set_attr("FCompute", LaOpBackwSyevd); From 4a2dae456758afa2fda0c1c90b7562ae44108a8f Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 28 May 2021 16:45:35 -0700 Subject: [PATCH 06/30] Marked 2 ops CUDA Graphs incompatible due to cpu->gpu copy --- src/operator/numpy/np_pad_op.cu | 16 ++++++++++++++-- src/operator/tensor/matrix_op.cu | 16 ++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/operator/numpy/np_pad_op.cu b/src/operator/numpy/np_pad_op.cu index 01a7035ab42d..97cb423b7bab 100644 --- a/src/operator/numpy/np_pad_op.cu +++ b/src/operator/numpy/np_pad_op.cu @@ -28,9 +28,21 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_pad).set_attr("FCompute", NumpyPadOpForward); 
+NNVM_REGISTER_OP(_npi_pad) + // Incompatible due to Copy(xpu_tensor, cpu_tensor) in NumpyPadOpForward + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyPadOpForward); -NNVM_REGISTER_OP(_backward_npi_pad).set_attr("FCompute", NumpyPadOpBackward); +NNVM_REGISTER_OP(_backward_npi_pad) + // Incompatible due to Copy(xpu_tensor, cpu_tensor) in NumpyPadOpBackward + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyPadOpBackward); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index dc7d608dacb9..e26b767802a3 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -439,9 +439,21 @@ NNVM_REGISTER_OP(depth_to_space).set_attr("FCompute", DepthToSpac NNVM_REGISTER_OP(space_to_depth).set_attr("FCompute", SpaceToDepthOpForward); -NNVM_REGISTER_OP(_split_v2).set_attr("FCompute", SplitOpForwardGPU); +NNVM_REGISTER_OP(_split_v2) + // Incompatible due to Copy(xpu_tensor, cpu_tensor) in SplitOpForwardImpl + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", SplitOpForwardGPU); -NNVM_REGISTER_OP(_split_v2_backward).set_attr("FCompute", SplitOpBackward); +NNVM_REGISTER_OP(_split_v2_backward) + // Incompatible due to Copy(xpu_tensor, cpu_tensor) in SplitOpBackwardImpl + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", SplitOpBackward); } // namespace op } // namespace mxnet From 64e8555c454a60e79904be917ef9cf0793f1acd3 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 28 May 2021 21:56:14 -0700 Subject: [PATCH 07/30] Mark cuDNN Dropout as fully CUDA Graphs compatible. Reenable tests. 
--- src/operator/nn/dropout.cu | 16 ++++++++++++---- tests/python/gpu/test_gluon_gpu.py | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index bff9b020126c..5e8f55881af6 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -30,10 +30,18 @@ namespace op { NNVM_REGISTER_OP(Dropout) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool is_train) { - // Dropout is just passthrough during inference - return !is_train; - }) + [](const NodeAttrs& attrs, const bool is_train) { + // Dropout is just passthrough during inference for all impls + if (!is_train) + return true; + + // cuDNN impl is compatible during training as well + const DropoutParam& param = nnvm::get(attrs.parsed); + real_t pkeep = 1.0f - param.p; + bool cudnn_off = param.cudnn_off && param.cudnn_off.value(); + bool cudnn_available = pkeep > 0 && !cudnn_off; + return MXNET_USE_CUDNN_DROPOUT && cudnn_available; + }) .set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index f58322d101bb..af6805c9cd5c 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -633,7 +633,7 @@ def generate_inputs(self): TestDesc('ConvTranspose', lambda: mx.gluon.nn.Conv2DTranspose(channels=32, kernel_size=(1,1))), TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), TestDesc('Activation', lambda: mx.gluon.nn.Activation('tanh')), - #TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), + TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), TestDesc('Flatten', lambda: mx.gluon.nn.Flatten()), TestDesc('MaxPool', lambda: mx.gluon.nn.MaxPool2D()), TestDesc('AvgPool', lambda: mx.gluon.nn.AvgPool2D()), From 78215fa94425c56b26d43c9a4cd7a49181cbfb81 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 17 Feb 2022 13:08:11 -0800 Subject: [PATCH 08/30] clang-tidy fixes 
--- src/imperative/attach_op_execs_pass.cc | 34 ++- src/imperative/cuda_graphs.h | 285 +++++++++--------- src/imperative/exec_pass.h | 11 +- src/imperative/imperative_utils.h | 8 +- src/operator/contrib/adamw.cu | 17 +- src/operator/contrib/index_array.cu | 4 +- src/operator/contrib/multi_lamb.cu | 8 +- src/operator/nn/dropout.cu | 24 +- src/operator/numpy/linalg/np_eig.cu | 8 +- src/operator/numpy/linalg/np_eigvals.cu | 8 +- src/operator/numpy/linalg/np_norm_backward.cu | 9 +- src/operator/numpy/linalg/np_norm_forward.cu | 9 +- src/operator/numpy/np_boolean_mask_assign.cu | 8 +- src/operator/numpy/np_constraint_check.cu | 4 +- src/operator/numpy/np_matrix_op.cu | 40 ++- src/operator/numpy/np_nonzero_op.cu | 4 +- src/operator/numpy/np_pad_op.cu | 8 +- src/operator/numpy/np_percentile_op.cu | 4 +- .../numpy/random/np_exponential_op.cu | 10 +- src/operator/numpy/random/np_gamma_op.cu | 4 +- .../numpy/random/np_multinomial_op.cu | 4 +- src/operator/numpy/random/np_normal_op.cu | 8 +- src/operator/numpy/random/np_pareto_op.cu | 4 +- src/operator/numpy/random/np_power_op.cu | 4 +- src/operator/numpy/random/np_rayleigh_op.cu | 4 +- .../tensor/elemwise_unary_op_basic.cu | 4 +- src/operator/tensor/indexing_op.cu | 4 +- src/operator/tensor/la_op.cu | 32 +- src/operator/tensor/matrix_op.cu | 16 +- 29 files changed, 260 insertions(+), 327 deletions(-) diff --git a/src/imperative/attach_op_execs_pass.cc b/src/imperative/attach_op_execs_pass.cc index 719c9b3165fe..51736c577e5b 100644 --- a/src/imperative/attach_op_execs_pass.cc +++ b/src/imperative/attach_op_execs_pass.cc @@ -48,8 +48,8 @@ namespace exec { class StorageFallbackOpExecutor : public OpExecutor { public: explicit StorageFallbackOpExecutor(const NodeAttrs& attrs, - DispatchMode dispatch_mode, - std::vector mutate_idx) + DispatchMode dispatch_mode, + std::vector mutate_idx) : OpExecutor(attrs, dispatch_mode), mutate_idx_(std::move(mutate_idx)) {} void Setup() override { @@ -253,10 +253,13 @@ class FComputeExExecutor 
: public OpExecutor { return exec_type_; } - explicit FComputeExExecutor(const NodeAttrs& attrs, DispatchMode dispatch_mode, - FComputeEx fcompute, ExecType exec_type) - : OpExecutor(attrs, dispatch_mode), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} - : attrs_(std::move(attrs)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} + explicit FComputeExExecutor(const NodeAttrs& attrs, + DispatchMode dispatch_mode, + FComputeEx fcompute, + ExecType exec_type) + : OpExecutor(attrs, dispatch_mode), + fcompute_(std::move(fcompute)), + exec_type_(exec_type) {} private: FComputeEx fcompute_; @@ -312,8 +315,11 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, common::GetFCompute(op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared( - inode.source->attrs, dispatch_modes[i], state, fcompute_ex, exec_type); + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], + state, + fcompute_ex, + exec_type); } else { FStatefulCompute fcompute = common::GetFCompute(op, "FStatefulCompute", vctx[i]); @@ -333,7 +339,10 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared(inode.source->attrs, - dispatch_modes[i], ret[fwd_id].get()->state(), fcompute_ex, exec_type); + dispatch_modes[i], + ret[fwd_id].get()->state(), + fcompute_ex, + exec_type); } else { FStatefulCompute fcompute = common::GetFCompute(op, "FStatefulCompute", vctx[i]); @@ -341,8 +350,11 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " 
<< op->name; ret[i] = std::make_shared(inode.source->attrs, - dispatch_modes[i], ret[fwd_id].get()->state(), fcompute, exec_type, - mutate_index); + dispatch_modes[i], + ret[fwd_id].get()->state(), + fcompute, + exec_type, + mutate_index); } } else { FCompute fcompute = common::GetFCompute(op, "FCompute", vctx[i]); diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 97424cfac074..a264a6edbfc4 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -78,62 +78,71 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { CUgraphNodeType t; CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); switch (t) { - case CU_GRAPH_NODE_TYPE_KERNEL: - { - CUDA_KERNEL_NODE_PARAMS kparams; - auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); - if (err == CUDA_SUCCESS) { - ss << "GPUKernel@" << kparams.func; - dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); - dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); - ss << "<<>>"; - ss << "(..."; - if (kparams.sharedMemBytes != 0) - ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; - ss << ")"; - } else { - ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; - } + case CU_GRAPH_NODE_TYPE_KERNEL: { + CUDA_KERNEL_NODE_PARAMS kparams; + auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); + if (err == CUDA_SUCCESS) { + ss << "GPUKernel@" << kparams.func; + dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); + dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); + ss << "<<>>"; + ss << "(..."; + if (kparams.sharedMemBytes != 0) + ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; + ss << ")"; + } else { + ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; } - break; - case CU_GRAPH_NODE_TYPE_MEMCPY: - { - cudaMemcpy3DParms mparams = {}; - CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); - // If memcpy is seen, return without setting up 
runnable executor - switch (mparams.kind) { - case cudaMemcpyHostToHost: ss << "Host->Host "; break; - case cudaMemcpyHostToDevice: ss << "Host->Device "; break; - case cudaMemcpyDeviceToHost: ss << "Device->Host "; break; - case cudaMemcpyDeviceToDevice: ss << "Device->Device "; break; - default: break; - } - ss << "Memcpy"; + } break; + case CU_GRAPH_NODE_TYPE_MEMCPY: { + cudaMemcpy3DParms mparams = {}; + CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); + // If memcpy is seen, return without setting up runnable executor + switch (mparams.kind) { + case cudaMemcpyHostToHost: + ss << "Host->Host "; + break; + case cudaMemcpyHostToDevice: + ss << "Host->Device "; + break; + case cudaMemcpyDeviceToHost: + ss << "Device->Host "; + break; + case cudaMemcpyDeviceToDevice: + ss << "Device->Device "; + break; + default: break; } - break; - case CU_GRAPH_NODE_TYPE_MEMSET: - { - cudaMemsetParams mparams = {}; - CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); - if (mparams.height == 1 && mparams.elementSize == 1) { - ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value - << ", count=" << mparams.width << ")"; - } else { - if (mparams.elementSize == 1) - ss << "cudaMemset2D"; - else - ss << "MemSet"; - ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch - << ", value=" << mparams.value << ", width=" << mparams.width - << ", height=" << mparams.height << ")"; - } + ss << "Memcpy"; + } break; + case CU_GRAPH_NODE_TYPE_MEMSET: { + cudaMemsetParams mparams = {}; + CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); + if (mparams.height == 1 && mparams.elementSize == 1) { + ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value + << ", count=" << mparams.width << ")"; + } else { + if (mparams.elementSize == 1) + ss << "cudaMemset2D"; + else + ss << "MemSet"; + ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch + << ", value=" << mparams.value << ", width=" << mparams.width + << ", height=" << 
mparams.height << ")"; } + } break; + case CU_GRAPH_NODE_TYPE_HOST: + ss << "Host (executable) node"; + break; + case CU_GRAPH_NODE_TYPE_GRAPH: + ss << "Node which executes an embedded graph"; + break; + case CU_GRAPH_NODE_TYPE_EMPTY: + ss << "Empty (no-op) node"; break; - case CU_GRAPH_NODE_TYPE_HOST: ss << "Host (executable) node"; break; - case CU_GRAPH_NODE_TYPE_GRAPH: ss << "Node which executes an embedded graph"; break; - case CU_GRAPH_NODE_TYPE_EMPTY: ss << "Empty (no-op) node"; break; - default: ss << "Unknown/Invalid node type " << t; + default: + ss << "Unknown/Invalid node type " << t; } return ss.str(); } @@ -142,7 +151,7 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { // Function objects (preferred for readability) provide the deleter function. class CudaGraphDeleter { public: - void operator() (cudaGraph_t graph) { + void operator()(cudaGraph_t graph) { if (graph != nullptr) CUDA_CALL(cudaGraphDestroy(graph)); } @@ -152,7 +161,7 @@ class CudaGraphDeleter { // Function objects (preferred for readability) provide the deleter function. class CudaGraphExecDeleter { public: - void operator() (cudaGraphExec_t graph_exec) { + void operator()(cudaGraphExec_t graph_exec) { if (graph_exec != nullptr) CUDA_CALL(cudaGraphExecDestroy(graph_exec)); } @@ -162,25 +171,22 @@ class CudaGraphExecDeleter { // characterized by a starting index in the OpExecutor list and a number of ops. 
class CudaGraphsSubSegExec { public: - CudaGraphsSubSegExec(const std::vector > &exec_list, - const RunContext &rctx, + CudaGraphsSubSegExec(const std::vector >& exec_list, + const RunContext& rctx, bool is_gpu, bool verbose, int from_op_idx, int num_ops, - bool ops_are_cuda_graph_compatible = true) : - from_op_idx_(from_op_idx), - num_ops_(num_ops), - graph_(nullptr), - graph_exec_(nullptr) { + bool ops_are_cuda_graph_compatible = true) + : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr) { if (ops_are_cuda_graph_compatible) { MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); MakeGraphExec(); } } - void Update(const std::vector > &exec_list, - const RunContext &rctx, + void Update(const std::vector >& exec_list, + const RunContext& rctx, bool is_gpu, bool verbose) { // Current executor should be Runnable with the same parameters @@ -189,8 +195,8 @@ class CudaGraphsSubSegExec { cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; cudaGraphNode_t error_node; - cudaError_t e = cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), - &error_node, &update_result); + cudaError_t e = + cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), &error_node, &update_result); switch (e) { case cudaErrorGraphExecUpdateFailure: MakeGraphExec(); @@ -204,11 +210,11 @@ class CudaGraphsSubSegExec { } } - void RunSubSeg(const std::vector > &exec_list, - const RunContext &rctx, + void RunSubSeg(const std::vector >& exec_list, + const RunContext& rctx, bool is_gpu) { if (IsRunnable()) { - auto s = rctx.get_stream(); + auto s = rctx.get_stream(); const cudaStream_t cu_s = mshadow::Stream::GetStream(s); CUDA_CALL(cudaGraphLaunch(graph_exec_.get(), cu_s)); } else { @@ -218,16 +224,18 @@ class CudaGraphsSubSegExec { } } - bool IsRunnable() { return graph_exec_ != nullptr; } + bool IsRunnable() { + return graph_exec_ != nullptr; + } private: - void MakeGraph(const std::vector > &exec_list, - const RunContext &rctx, - bool is_gpu, 
- bool verbose, - int from_op_idx, - int num_ops) { - auto s = rctx.get_stream(); + void MakeGraph(const std::vector >& exec_list, + const RunContext& rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops) { + auto s = rctx.get_stream(); const cudaStream_t cu_s = mshadow::Stream::GetStream(s); // Create CUDA Graph // Use of cudaStreamCaptureModeThreadLocal allows other threads like GPU Copy workers @@ -242,38 +250,36 @@ class CudaGraphsSubSegExec { if (verbose) { std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); - size_t num_nodes = graph_nodes.size(); + size_t num_nodes = graph_nodes.size(); LOG(INFO) << " Graph has " << num_nodes << " nodes:"; for (size_t i = 0; i != num_nodes; ++i) { - LOG(INFO) << " node " << i << " = " - << CudaGraphNodeToString(graph_nodes[i]); + LOG(INFO) << " node " << i << " = " << CudaGraphNodeToString(graph_nodes[i]); } } } void MakeGraphExec() { - cudaGraphExec_t cuda_graph_exec; - cudaGraphNode_t error_node; - char log_buffer[1000]; - - CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), - &error_node, log_buffer, 1000)); - graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); - - // At this point we have a CUDA Graph executor - static int num_graph_creations_logged = 0; - static int max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); - if (num_graph_creations_logged < max_log_entries) { - num_graph_creations_logged++; - LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; - if (num_graph_creations_logged == max_log_entries) - LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; - } + cudaGraphExec_t cuda_graph_exec; + cudaGraphNode_t error_node; + char log_buffer[1000]; + + CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), &error_node, log_buffer, 1000)); + graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); + + // At this point we have a CUDA Graph executor + static int num_graph_creations_logged = 0; + static int 
max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); + if (num_graph_creations_logged < max_log_entries) { + num_graph_creations_logged++; + LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; + if (num_graph_creations_logged == max_log_entries) + LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; + } } int from_op_idx_; int num_ops_; - using cudaGraphStruct_t = typename std::remove_pointer::type; + using cudaGraphStruct_t = typename std::remove_pointer::type; using cudaGraphExecStruct_t = typename std::remove_pointer::type; std::shared_ptr graph_; std::shared_ptr graph_exec_; @@ -283,7 +289,7 @@ class CudaGraphsSubSegExec { struct CudaGraphInfo { std::vector cuda_graph_subseg_execs; bool has_been_run_conventionally = false; - std::vector tempspace_dptrs; + std::vector tempspace_dptrs; }; // A CUDA graph is maintained for every combination of cudaStream_t (i.e. GPU Worker) and // the state of the is_train flag of the OpContext. If the tempspace_dptrs change, we @@ -292,7 +298,7 @@ struct CudaGraphCacheKey { cudaStream_t cu_s; bool is_train; // overload '<' so CudaGraphCacheKey can be used as a std::map key - bool operator<(const CudaGraphCacheKey &other) const { + bool operator<(const CudaGraphCacheKey& other) const { return cu_s < other.cu_s || (cu_s == other.cu_s && is_train < other.is_train); } }; @@ -300,20 +306,20 @@ using CudaGraphCache = std::map; class CudaGraphsExec { public: - CudaGraphsExec(const std::vector > &exec_list, + CudaGraphsExec(const std::vector >& exec_list, bool is_gpu, - const char *opr_names) : - verbose_(false), is_enabled_(false) { + const char *opr_names) + : verbose_(false), is_enabled_(false) { opr_names_ = opr_names ? 
std::string(opr_names) : std::string(); if (is_gpu) { is_enabled_ = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false); - verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); + verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); SetTempSpaces(exec_list); } } - void RunAll(const std::vector > &exec_list, - const RunContext &rctx, + void RunAll(const std::vector >& exec_list, + const RunContext& rctx, bool is_gpu) { // If this a CPU op or CUDA Graphs use isn't possible, run normally and return if (!is_gpu || !is_enabled_) { @@ -324,14 +330,14 @@ class CudaGraphsExec { // Also if we're in a warm-up period where tempspace pointers are likely // to change, run normally and return - auto s = rctx.get_stream(); + auto s = rctx.get_stream(); const cudaStream_t cu_s = mshadow::Stream::GetStream(s); // All the ops in the bulked segment will have the same setting of is_train as the first op - const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; + const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; const CudaGraphCacheKey key = {cu_s, is_train}; // Look-up the CUDA Graph info for this combo of stream and is_train setting // This may create a default-initialized new entry. - auto &cuda_graph_info = cache_[key]; + auto& cuda_graph_info = cache_[key]; if (!cuda_graph_info.has_been_run_conventionally) { // Run all opr in the sub-graph exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); @@ -347,7 +353,7 @@ class CudaGraphsExec { if (cuda_graph_info.cuda_graph_subseg_execs.size() > 0 && cuda_graph_info.tempspace_dptrs != before_exec_tempspace_ptrs) { // Update all runnable executors. Non-runnable executors launch their ops conventionally. 
- for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { if (subseg_exec.IsRunnable()) subseg_exec.Update(exec_list, rctx, is_gpu, verbose_); } @@ -365,7 +371,9 @@ class CudaGraphsExec { break; } if (num_good_ops > 0) { - CreateSubExecOverRegion(exec_list, rctx, is_gpu, + CreateSubExecOverRegion(exec_list, + rctx, + is_gpu, first_op_idx, first_op_idx + num_good_ops, &cuda_graph_info.cuda_graph_subseg_execs); @@ -375,7 +383,7 @@ class CudaGraphsExec { // We had to have hit an op that was not OK. if (verbose_) { LOG(INFO) << "Bypassing notOK op segment[" << first_op_idx << "," << first_op_idx << "]" - << " of op segment " << opr_names_; + << " of op segment " << opr_names_; } CudaGraphsSubSegExec notOK_opseg(exec_list, rctx, is_gpu, false, first_op_idx, 1, false); cuda_graph_info.cuda_graph_subseg_execs.push_back(notOK_opseg); @@ -393,19 +401,19 @@ class CudaGraphsExec { // Now execute the CUDA Graph that we either just created or looked-up in the cache. if (verbose_) { int runnable_execs = 0; - int bypassed_ops = 0; - for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + int bypassed_ops = 0; + for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { if (subseg_exec.IsRunnable()) runnable_execs++; else bypassed_ops++; } - LOG(INFO) << "Launching " << runnable_execs - << " captured CUDA graph(s) for op segment " << opr_names_; + LOG(INFO) << "Launching " << runnable_execs << " captured CUDA graph(s) for op segment " + << opr_names_; if (bypassed_ops > 0) LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; } - for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) + for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) subseg_exec.RunSubSeg(exec_list, rctx, is_gpu); } @@ -413,12 +421,12 @@ class CudaGraphsExec { // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). 
If such a graph // is not runnable, e.g. if it includes memcpys from unpinned cpu memory, then make a // number of smaller graphs that avoid those ops with the memcpys. - void CreateSubExecOverRegion(const std::vector > &exec_list, - const RunContext &rctx, + void CreateSubExecOverRegion(const std::vector >& exec_list, + const RunContext& rctx, bool is_gpu, size_t from_op_idx, size_t upto_op_idx, - std::vector *cuda_graph_subseg_execs) { + std::vector* cuda_graph_subseg_execs) { // Optimistically try to create a CUDA Graph of the entire op segment region int num_ops = upto_op_idx - from_op_idx; @@ -429,21 +437,20 @@ class CudaGraphsExec { if (verbose_) LOG(INFO) << " Graph was not runnable- creating op sub-segments..."; // Enter fall-back approach to making many sub-execs - for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx; ) { + for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx;) { int num_good_ops = 0; for (size_t last_op_idx = first_op_idx; last_op_idx != upto_op_idx; ++last_op_idx) { CudaGraphsSubSegExec single_opseg(exec_list, rctx, is_gpu, false, last_op_idx, 1); if (single_opseg.IsRunnable()) num_good_ops++; // Is it time to create a subseg exec from accumulated good ops? 
- if (num_good_ops > 0 && - (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { + if (num_good_ops > 0 && (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { if (verbose_) - LOG(INFO) << "Capturing CUDA graph of op sub segment[" - << first_op_idx << ":" << (first_op_idx + num_good_ops - 1) << "]" + LOG(INFO) << "Capturing CUDA graph of op sub segment[" << first_op_idx << ":" + << (first_op_idx + num_good_ops - 1) << "]" << " of op segment " << opr_names_; - CudaGraphsSubSegExec good_opseg(exec_list, rctx, is_gpu, verbose_, - first_op_idx, num_good_ops); + CudaGraphsSubSegExec good_opseg( + exec_list, rctx, is_gpu, verbose_, first_op_idx, num_good_ops); CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; cuda_graph_subseg_execs->push_back(good_opseg); first_op_idx += num_good_ops; @@ -452,7 +459,7 @@ class CudaGraphsExec { if (!single_opseg.IsRunnable()) { if (verbose_) { LOG(INFO) << "Bypassing op sub segment[" << last_op_idx << "," << last_op_idx << "]" - << " of op segment " << opr_names_; + << " of op segment " << opr_names_; // Generate throw-away exec in order to produce a diagnostic listing of graph nodes CudaGraphsSubSegExec dummy(exec_list, rctx, is_gpu, verbose_, last_op_idx, 1); } @@ -466,11 +473,11 @@ class CudaGraphsExec { } // Is the Op OK to make part of a CUDA Graph? 
- bool OpOK(const std::shared_ptr &exec) { - static auto& fstateful = Op::GetAttr("FCreateOpState"); + bool OpOK(const std::shared_ptr& exec) { + static auto& fstateful = Op::GetAttr("FCreateOpState"); static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); - static auto& fcompute_ex = Op::GetAttr("FComputeEx"); - const auto& attrs = exec->attrs; + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); + const auto& attrs = exec->attrs; if (attrs.op != nullptr) { const auto f = fgraphcompatible.get(attrs.op, nullptr); if (f != nullptr) { @@ -478,7 +485,7 @@ class CudaGraphsExec { } if (fstateful.get(attrs.op, nullptr) != nullptr) { if (verbose_) { - LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; + LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; } return false; } @@ -507,10 +514,10 @@ class CudaGraphsExec { } // Determine Tempspaces used by ops. Other resource uses disable CUDA Graphs. - void SetTempSpaces(const std::vector > &exec_list) { + void SetTempSpaces(const std::vector >& exec_list) { // Gather info about the ops use of TempSpace. 
if (is_enabled_) { - std::set tempspaces_set; + std::set tempspaces_set; for (auto& exec : exec_list) { for (auto& resource : exec->op_ctx.requested) { if (resource.req.type == ResourceRequest::kTempSpace) { @@ -523,18 +530,18 @@ class CudaGraphsExec { } // Return the addresses of the gpu TempSpace areas - std::vector GetGPUTempspacePtrs(mshadow::Stream *s) { - std::vector ret; + std::vector GetGPUTempspacePtrs(mshadow::Stream* s) { + std::vector ret; for (const auto& resource : tempspaces_) { // Ask for minimal allocation to get base pointer without increasing the size - auto *base_ptr = resource->get_space_typed(mshadow::Shape1(1), s).dptr_; - ret.push_back(static_cast(base_ptr)); + auto* base_ptr = resource->get_space_typed(mshadow::Shape1(1), s).dptr_; + ret.push_back(static_cast(base_ptr)); } return ret; } CudaGraphCache cache_; - std::vector tempspaces_; + std::vector tempspaces_; std::string opr_names_; bool verbose_; bool is_enabled_; diff --git a/src/imperative/exec_pass.h b/src/imperative/exec_pass.h index 6da6c8d95c69..1b9ef071f676 100644 --- a/src/imperative/exec_pass.h +++ b/src/imperative/exec_pass.h @@ -90,8 +90,8 @@ class OpExecutor { /*! \brief dispatch mode of the executor */ DispatchMode dispatch_mode; - explicit OpExecutor(NodeAttrs attrs, DispatchMode dispatch_mode) : - attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {} + explicit OpExecutor(NodeAttrs attrs, DispatchMode dispatch_mode) + : attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {} /*! \brief virtual destructor */ virtual ~OpExecutor() {} /*! @@ -111,9 +111,10 @@ class OpExecutor { * This function call does not synchronize the stream. * \param rctx The runtime context passed in by environment. */ - static void RunAll(const std::vector > &execs, - RunContext rctx, bool is_gpu) { - for (auto &exec : execs) + static void RunAll(const std::vector >& execs, + RunContext rctx, + bool is_gpu) { + for (auto& exec : execs) exec->Run(rctx, is_gpu); } /*! 
\return the execution type */ diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index bd9fe783ab28..7f90528f4793 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -1254,10 +1254,10 @@ inline Engine::OprHandle CreateEngineOp( // Provide initialized `cuda_graphs_exec`, which when captured // by exec_fun, acts like a static variable inside the mutable closure. cuda_graphs::CudaGraphsExec cuda_graphs_exec(execs, is_gpu, opr_names); - auto exec_fun = [cuda_graphs_exec, execs, is_async, is_gpu] ( - RunContext ctx, - Engine::CallbackOnStart on_start, - Engine::CallbackOnComplete on_complete) mutable { + auto exec_fun = [cuda_graphs_exec, execs, is_async, is_gpu]( + RunContext ctx, + Engine::CallbackOnStart on_start, + Engine::CallbackOnComplete on_complete) mutable { on_start(); if (is_async) { execs[0]->op_ctx.async_on_complete = on_complete; diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu index 33fca6ccea67..0b247900ce4c 100644 --- a/src/operator/contrib/adamw.cu +++ b/src/operator/contrib/adamw.cu @@ -46,33 +46,24 @@ void GetScaleFloat(mshadow::Stream* s, const TBlob& scale_blob, float* NNVM_REGISTER_OP(_adamw_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", adamw::MPUpdate>); NNVM_REGISTER_OP(_mp_adamw_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", adamw::MPUpdate>); NNVM_REGISTER_OP(_multi_adamw_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", adamw::multiMPUpdate); NNVM_REGISTER_OP(_multi_mp_adamw_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - 
return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", adamw::multiMPUpdate); - } // namespace adamw } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/index_array.cu b/src/operator/contrib/index_array.cu index d317ccdf7059..3702fed2a06a 100644 --- a/src/operator/contrib/index_array.cu +++ b/src/operator/contrib/index_array.cu @@ -84,9 +84,7 @@ void IndexArrayForwardGPU(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(_contrib_index_array) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", IndexArrayForwardGPU); } // namespace op diff --git a/src/operator/contrib/multi_lamb.cu b/src/operator/contrib/multi_lamb.cu index 8361e04cabb3..c6bedfc861f8 100644 --- a/src/operator/contrib/multi_lamb.cu +++ b/src/operator/contrib/multi_lamb.cu @@ -269,16 +269,12 @@ void CallKernel2(Stream* s, NNVM_REGISTER_OP(_multi_lamb_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", MultiLAMBUpdate); NNVM_REGISTER_OP(_multi_mp_lamb_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", MultiLAMBUpdate); } // namespace op diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 5e8f55881af6..98cddbc97634 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -30,18 +30,20 @@ namespace op { NNVM_REGISTER_OP(Dropout) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool is_train) { - // Dropout is just passthrough during inference for all impls - if (!is_train) - return true; + [](const NodeAttrs& attrs, const bool is_train) { + // Dropout is a passthrough during 
inference for all impls + if (!is_train) + return true; - // cuDNN impl is compatible during training as well - const DropoutParam& param = nnvm::get(attrs.parsed); - real_t pkeep = 1.0f - param.p; - bool cudnn_off = param.cudnn_off && param.cudnn_off.value(); - bool cudnn_available = pkeep > 0 && !cudnn_off; - return MXNET_USE_CUDNN_DROPOUT && cudnn_available; - }) + // cuDNN impl is compatible during training as well + const DropoutParam& param = + nnvm::get(attrs.parsed); + real_t pkeep = 1.0f - param.p; + bool cudnn_off = + param.cudnn_off && param.cudnn_off.value(); + bool cudnn_available = pkeep > 0 && !cudnn_off; + return MXNET_USE_CUDNN_DROPOUT && cudnn_available; + }) .set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) diff --git a/src/operator/numpy/linalg/np_eig.cu b/src/operator/numpy/linalg/np_eig.cu index ab5c6644501d..a217b6d4e0e7 100644 --- a/src/operator/numpy/linalg/np_eig.cu +++ b/src/operator/numpy/linalg/np_eig.cu @@ -30,18 +30,14 @@ namespace op { NNVM_REGISTER_OP(_npi_eig) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", EigOpForward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_npi_eigh) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", EighOpForward); #endif diff --git a/src/operator/numpy/linalg/np_eigvals.cu b/src/operator/numpy/linalg/np_eigvals.cu index 94a007d7a245..be00d8c991d9 100644 --- a/src/operator/numpy/linalg/np_eigvals.cu +++ b/src/operator/numpy/linalg/np_eigvals.cu @@ -30,18 +30,14 @@ namespace op { NNVM_REGISTER_OP(_npi_eigvals) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", EigvalsOpForward); #if MXNET_USE_CUSOLVER == 1 
NNVM_REGISTER_OP(_npi_eigvalsh) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", EigvalshOpForward); #endif diff --git a/src/operator/numpy/linalg/np_norm_backward.cu b/src/operator/numpy/linalg/np_norm_backward.cu index e88d717bd580..23a021d00ce5 100644 --- a/src/operator/numpy/linalg/np_norm_backward.cu +++ b/src/operator/numpy/linalg/np_norm_backward.cu @@ -27,10 +27,11 @@ namespace op { NNVM_REGISTER_OP(_backward_npi_norm) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - const NumpyNormParam& param = nnvm::get(attrs.parsed); - return param.axis.value().ndim() == 2; - }) + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = + nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeBackward); } // namespace op diff --git a/src/operator/numpy/linalg/np_norm_forward.cu b/src/operator/numpy/linalg/np_norm_forward.cu index 26a87c580010..7399727324d0 100644 --- a/src/operator/numpy/linalg/np_norm_forward.cu +++ b/src/operator/numpy/linalg/np_norm_forward.cu @@ -27,10 +27,11 @@ namespace op { NNVM_REGISTER_OP(_npi_norm) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - const NumpyNormParam& param = nnvm::get(attrs.parsed); - return param.axis.value().ndim() == 2; - }) + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = + nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeForward); } // namespace op diff --git a/src/operator/numpy/np_boolean_mask_assign.cu b/src/operator/numpy/np_boolean_mask_assign.cu index 8e1bd57386cd..216e8ff2b839 100644 --- a/src/operator/numpy/np_boolean_mask_assign.cu +++ b/src/operator/numpy/np_boolean_mask_assign.cu @@ -274,16 +274,12 @@ void NumpyBooleanAssignForwardGPU(const nnvm::NodeAttrs& attrs, 
NNVM_REGISTER_OP(_npi_boolean_mask_assign_scalar) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); NNVM_REGISTER_OP(_npi_boolean_mask_assign_tensor) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); } // namespace op diff --git a/src/operator/numpy/np_constraint_check.cu b/src/operator/numpy/np_constraint_check.cu index 8d622d152bac..26a5f0178c0b 100644 --- a/src/operator/numpy/np_constraint_check.cu +++ b/src/operator/numpy/np_constraint_check.cu @@ -39,9 +39,7 @@ void GetReduceOutput(mshadow::Stream* s, const TBlob& output_blob, boo NNVM_REGISTER_OP(_npx_constraint_check) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", ConstraintCheckForward); } // namespace op diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index ed9ab81f2066..27858988432d 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -54,16 +54,12 @@ NNVM_REGISTER_OP(_backward_np_column_stack) NNVM_REGISTER_OP(_npi_tril_indices) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", TrilindicesOpForward); NNVM_REGISTER_OP(_npi_roll) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", NumpyRollCompute); template <> @@ -104,16 +100,12 @@ void NumpyFlipForwardImpl(const OpContext& ctx, NNVM_REGISTER_OP(_npi_flip) .set_attr("FIsCUDAGraphsCompatible", - 
[](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", NumpyFlipForward); NNVM_REGISTER_OP(_backward_npi_flip) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", NumpyFlipForward); NNVM_REGISTER_OP(_npi_moveaxis).set_attr("FCompute", NumpyMoveaxisCompute); @@ -125,17 +117,19 @@ NNVM_REGISTER_OP(_npi_rollaxis_backward) NNVM_REGISTER_OP(_npi_rot90) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - const auto& param = nnvm::get(attrs.parsed); - // Should track code in NumpyRot90Compute() - int real_k(param.k); - real_k = real_k % 4; - if (real_k < 0) { - real_k += 4; - } - // Avoid NumpyRot90ComputeFlipIml(), which uses mshadow::Copy() - return real_k != 2; - }) + [](const NodeAttrs& attrs, const bool) { + const auto& param = + nnvm::get(attrs.parsed); + // Should track code in NumpyRot90Compute() + int real_k(param.k); + real_k = real_k % 4; + if (real_k < 0) { + real_k += 4; + } + // Avoid NumpyRot90ComputeFlipIml(), + // which uses mshadow::Copy() + return real_k != 2; + }) .set_attr("FCompute", NumpyRot90Compute); NNVM_REGISTER_OP(_npi_hsplit).set_attr("FCompute", HSplitOpForward); diff --git a/src/operator/numpy/np_nonzero_op.cu b/src/operator/numpy/np_nonzero_op.cu index 8ae39bdf2c4c..597331e458ff 100644 --- a/src/operator/numpy/np_nonzero_op.cu +++ b/src/operator/numpy/np_nonzero_op.cu @@ -116,9 +116,7 @@ NNVM_REGISTER_OP(_npx_nonzero) return std::vector{ResourceRequest::kTempSpace}; }) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FComputeEx", NonzeroForwardGPU); } // namespace op diff --git a/src/operator/numpy/np_pad_op.cu b/src/operator/numpy/np_pad_op.cu index 
97cb423b7bab..1b9f4f4d5a86 100644 --- a/src/operator/numpy/np_pad_op.cu +++ b/src/operator/numpy/np_pad_op.cu @@ -31,17 +31,13 @@ namespace op { NNVM_REGISTER_OP(_npi_pad) // Incompatible due to Copy(xpu_tensor, cpu_tensor) in NumpyPadOpForward .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyPadOpForward); NNVM_REGISTER_OP(_backward_npi_pad) // Incompatible due to Copy(xpu_tensor, cpu_tensor) in NumpyPadOpBackward .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyPadOpBackward); } // namespace op diff --git a/src/operator/numpy/np_percentile_op.cu b/src/operator/numpy/np_percentile_op.cu index 7ff67ad3c9c5..2dcc8294bb55 100644 --- a/src/operator/numpy/np_percentile_op.cu +++ b/src/operator/numpy/np_percentile_op.cu @@ -54,9 +54,7 @@ bool CheckInvalidInput(mshadow::Stream* s, NNVM_REGISTER_OP(_npi_percentile) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyPercentileForward); } // namespace op diff --git a/src/operator/numpy/random/np_exponential_op.cu b/src/operator/numpy/random/np_exponential_op.cu index 7a2068dad8a1..c908c9bb3372 100644 --- a/src/operator/numpy/random/np_exponential_op.cu +++ b/src/operator/numpy/random/np_exponential_op.cu @@ -28,15 +28,9 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_exponential) -<<<<<<< HEAD - .set_attr("FCompute", NumpyExponentialForward); -======= .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) -.set_attr("FCompute", NumpyExponentialForward); ->>>>>>> f4bcd48dd... 
[1.x][FEATURE] CUDA graphs support (#19142) + [](const NodeAttrs&, const bool) { return false; }) + .set_attr("FCompute", NumpyExponentialForward); NNVM_REGISTER_OP(_backward_broadcast_exponential) .set_attr("FCompute", ExponentialReparamBackward); diff --git a/src/operator/numpy/random/np_gamma_op.cu b/src/operator/numpy/random/np_gamma_op.cu index 8bfc61aad7ab..0191fd597ec6 100644 --- a/src/operator/numpy/random/np_gamma_op.cu +++ b/src/operator/numpy/random/np_gamma_op.cu @@ -30,9 +30,7 @@ namespace op { NNVM_REGISTER_OP(_npi_gamma) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyGammaForward); NNVM_REGISTER_OP(_backward_gamma_sample).set_attr("FCompute", NumpyGammaGrad); diff --git a/src/operator/numpy/random/np_multinomial_op.cu b/src/operator/numpy/random/np_multinomial_op.cu index ee77b79c6d91..575ad08b8184 100644 --- a/src/operator/numpy/random/np_multinomial_op.cu +++ b/src/operator/numpy/random/np_multinomial_op.cu @@ -42,9 +42,7 @@ void CheckPvalGPU(const OpContext& ctx, DType* input, int prob_length) { NNVM_REGISTER_OP(_npi_multinomial) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyMultinomialForward); } // namespace op diff --git a/src/operator/numpy/random/np_normal_op.cu b/src/operator/numpy/random/np_normal_op.cu index 3d310f82b20d..525a0e14a4e4 100644 --- a/src/operator/numpy/random/np_normal_op.cu +++ b/src/operator/numpy/random/np_normal_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_normal) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyNormalForward); NNVM_REGISTER_OP(_backward_broadcast_normal) @@ -39,9 +37,7 @@ 
NNVM_REGISTER_OP(_backward_broadcast_normal) NNVM_REGISTER_OP(_npi_normal_n) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyNormalForward); } // namespace op diff --git a/src/operator/numpy/random/np_pareto_op.cu b/src/operator/numpy/random/np_pareto_op.cu index 59900a0090e8..2948660f6e75 100644 --- a/src/operator/numpy/random/np_pareto_op.cu +++ b/src/operator/numpy/random/np_pareto_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_pareto) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyParetoForward); NNVM_REGISTER_OP(_backward_broadcast_pareto) diff --git a/src/operator/numpy/random/np_power_op.cu b/src/operator/numpy/random/np_power_op.cu index f5bcb1000771..f7a6686769d0 100644 --- a/src/operator/numpy/random/np_power_op.cu +++ b/src/operator/numpy/random/np_power_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_powerd) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyPowerForward); } // namespace op diff --git a/src/operator/numpy/random/np_rayleigh_op.cu b/src/operator/numpy/random/np_rayleigh_op.cu index c17d6c2bdd79..f67a2fe36ad7 100644 --- a/src/operator/numpy/random/np_rayleigh_op.cu +++ b/src/operator/numpy/random/np_rayleigh_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_rayleigh) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyRayleighForward); NNVM_REGISTER_OP(_backward_broadcast_rayleigh) diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index 
baa36c6be587..5099301a1e4f 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -117,9 +117,7 @@ void ShapeComputeGPU(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(shape_array) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", ShapeComputeGPU); void SizeComputeGPU(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index f4ce8ebcfd8a..992054f860ef 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -959,9 +959,7 @@ NNVM_REGISTER_OP(one_hot).set_attr("FCompute", OneHotOpForward("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", GatherNDForwardGPU); NNVM_REGISTER_OP(scatter_nd).set_attr("FCompute", ScatterNDForward); diff --git a/src/operator/tensor/la_op.cu b/src/operator/tensor/la_op.cu index 8cae4a4d5be8..a32143a31d60 100644 --- a/src/operator/tensor/la_op.cu +++ b/src/operator/tensor/la_op.cu @@ -89,9 +89,7 @@ NNVM_REGISTER_OP(_backward_linalg_maketrian) NNVM_REGISTER_OP(_linalg_potri) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_potri) @@ -107,45 +105,35 @@ NNVM_REGISTER_OP(_linalg_det) // Incompatibility comes from allocs made in linalg_batch_getrf(), called by det::op() // see https://github.com/apache/incubator-mxnet/issues/19353 .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpDetForward); NNVM_REGISTER_OP(_backward_linalg_det) // Incompatibility comes from allocs made in 
linalg_batch_getri(), // called by linalg_batch_det_backward_helper, called by det_backward::op() .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpDetBackward); NNVM_REGISTER_OP(_linalg_slogdet) // Incompatibility comes from allocs made in linalg_batch_getrf(), // called by slogdet::op(). .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpDetForward); NNVM_REGISTER_OP(_backward_linalg_slogdet) // Incompatibility comes from allocs made in linalg_batch_getri(), // called by linalg_batch_det_backward_helper, called by slogdet_backward::op() .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpDetBackward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_linalg_potrf) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_potrf) @@ -153,9 +141,7 @@ NNVM_REGISTER_OP(_backward_linalg_potrf) NNVM_REGISTER_OP(_linalg_gelqf) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_gelqf) @@ -163,9 +149,7 @@ NNVM_REGISTER_OP(_backward_linalg_gelqf) NNVM_REGISTER_OP(_linalg_syevd) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpForwSyevd); NNVM_REGISTER_OP(_backward_linalg_syevd) diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index 
e26b767802a3..00007bd2e602 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -414,16 +414,12 @@ NNVM_REGISTER_OP(_backward_tile).set_attr("FCompute", TileOpBackw NNVM_REGISTER_OP(reverse) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", ReverseOpForward); NNVM_REGISTER_OP(_backward_reverse) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", ReverseOpForward); NNVM_REGISTER_OP(stack).set_attr("FCompute", StackOpForward); @@ -442,17 +438,13 @@ NNVM_REGISTER_OP(space_to_depth).set_attr("FCompute", SpaceToDept NNVM_REGISTER_OP(_split_v2) // Incompatible due to Copy(xpu_tensor, cpu_tensor) in SplitOpForwardImpl .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", SplitOpForwardGPU); NNVM_REGISTER_OP(_split_v2_backward) // Incompatible due to Copy(xpu_tensor, cpu_tensor) in SplitOpBackwardImpl .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", SplitOpBackward); } // namespace op From a55892277504c4f824df984950eea7454ce78f4b Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 17 Feb 2022 15:29:25 -0800 Subject: [PATCH 09/30] More clang-tidy fixes --- include/mxnet/op_attr_types.h | 2 +- src/imperative/attach_op_execs_pass.cc | 11 +++-------- src/imperative/cuda_graphs.h | 9 +++++---- src/imperative/exec_pass.h | 2 +- src/operator/nn/dropout.cu | 1 - src/operator/numpy/random/np_bernoulli_op.cu | 4 +--- src/operator/numpy/random/np_exponential_op.cu | 4 ++-- src/operator/numpy/random/np_pareto_op.cu | 2 +- src/operator/numpy/random/np_weibull_op.cu | 4 +--- 9 
files changed, 15 insertions(+), 24 deletions(-) diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index 73504bb2748b..c936d3e84afa 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -366,7 +366,7 @@ using FNeedCalibrateOutput = std::function(const NodeAttrs& att * to stay the same as long as the shape and type * of input stays the same. */ -using FIsCUDAGraphsCompatible = std::function; +using FIsCUDAGraphsCompatible = std::function; #endif diff --git a/src/imperative/attach_op_execs_pass.cc b/src/imperative/attach_op_execs_pass.cc index 51736c577e5b..732391fdd747 100644 --- a/src/imperative/attach_op_execs_pass.cc +++ b/src/imperative/attach_op_execs_pass.cc @@ -257,9 +257,7 @@ class FComputeExExecutor : public OpExecutor { DispatchMode dispatch_mode, FComputeEx fcompute, ExecType exec_type) - : OpExecutor(attrs, dispatch_mode), - fcompute_(std::move(fcompute)), - exec_type_(exec_type) {} + : OpExecutor(attrs, dispatch_mode), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} private: FComputeEx fcompute_; @@ -315,11 +313,8 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, common::GetFCompute(op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(inode.source->attrs, - dispatch_modes[i], - state, - fcompute_ex, - exec_type); + ret[i] = std::make_shared( + inode.source->attrs, dispatch_modes[i], state, fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute(op, "FStatefulCompute", vctx[i]); diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index a264a6edbfc4..9c64972e9f2c 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -112,7 +112,8 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { case 
cudaMemcpyDeviceToDevice: ss << "Device->Device "; break; - default: break; + default: + break; } ss << "Memcpy"; } break; @@ -178,7 +179,7 @@ class CudaGraphsSubSegExec { int from_op_idx, int num_ops, bool ops_are_cuda_graph_compatible = true) - : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr) { + : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr) { if (ops_are_cuda_graph_compatible) { MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); MakeGraphExec(); @@ -308,7 +309,7 @@ class CudaGraphsExec { public: CudaGraphsExec(const std::vector >& exec_list, bool is_gpu, - const char *opr_names) + const char* opr_names) : verbose_(false), is_enabled_(false) { opr_names_ = opr_names ? std::string(opr_names) : std::string(); if (is_gpu) { @@ -448,7 +449,7 @@ class CudaGraphsExec { if (verbose_) LOG(INFO) << "Capturing CUDA graph of op sub segment[" << first_op_idx << ":" << (first_op_idx + num_good_ops - 1) << "]" - << " of op segment " << opr_names_; + << " of op segment " << opr_names_; CudaGraphsSubSegExec good_opseg( exec_list, rctx, is_gpu, verbose_, first_op_idx, num_good_ops); CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; diff --git a/src/imperative/exec_pass.h b/src/imperative/exec_pass.h index 1b9ef071f676..02fa967a19b7 100644 --- a/src/imperative/exec_pass.h +++ b/src/imperative/exec_pass.h @@ -111,7 +111,7 @@ class OpExecutor { * This function call does not synchronize the stream. * \param rctx The runtime context passed in by environment. 
*/ - static void RunAll(const std::vector >& execs, + static void RunAll(const std::vector>& execs, RunContext rctx, bool is_gpu) { for (auto& exec : execs) diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 98cddbc97634..6b4bfc8aebd2 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -34,7 +34,6 @@ NNVM_REGISTER_OP(Dropout) // Dropout is a passthrough during inference for all impls if (!is_train) return true; - // cuDNN impl is compatible during training as well const DropoutParam& param = nnvm::get(attrs.parsed); diff --git a/src/operator/numpy/random/np_bernoulli_op.cu b/src/operator/numpy/random/np_bernoulli_op.cu index 0cf9bd95ab7c..eee89c1ea8d4 100644 --- a/src/operator/numpy/random/np_bernoulli_op.cu +++ b/src/operator/numpy/random/np_bernoulli_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_bernoulli) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyBernoulliForward); } // namespace op diff --git a/src/operator/numpy/random/np_exponential_op.cu b/src/operator/numpy/random/np_exponential_op.cu index c908c9bb3372..8ad738639eae 100644 --- a/src/operator/numpy/random/np_exponential_op.cu +++ b/src/operator/numpy/random/np_exponential_op.cu @@ -28,8 +28,8 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_exponential) -.set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { return false; }) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyExponentialForward); NNVM_REGISTER_OP(_backward_broadcast_exponential) diff --git a/src/operator/numpy/random/np_pareto_op.cu b/src/operator/numpy/random/np_pareto_op.cu index 2948660f6e75..82fcd1f4d066 100644 --- a/src/operator/numpy/random/np_pareto_op.cu +++ b/src/operator/numpy/random/np_pareto_op.cu @@ -29,7 +29,7 @@ namespace op { 
NNVM_REGISTER_OP(_npi_pareto) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { return false; }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyParetoForward); NNVM_REGISTER_OP(_backward_broadcast_pareto) diff --git a/src/operator/numpy/random/np_weibull_op.cu b/src/operator/numpy/random/np_weibull_op.cu index 62c0a564c060..4495bab39206 100644 --- a/src/operator/numpy/random/np_weibull_op.cu +++ b/src/operator/numpy/random/np_weibull_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_weibull) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyWeibullForward); NNVM_REGISTER_OP(_backward_broadcast_weibull) From eaa7fc7eb70043d8575bc6235f9abb6dfddc7582 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 17 Feb 2022 17:50:49 -0800 Subject: [PATCH 10/30] Avoid CUDA_CALL(e): improper macro expansion --- src/imperative/cuda_graphs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 9c64972e9f2c..8a8a06fa6b6e 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -196,9 +196,9 @@ class CudaGraphsSubSegExec { cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; cudaGraphNode_t error_node; - cudaError_t e = + cudaError_t err = cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), &error_node, &update_result); - switch (e) { + switch (err) { case cudaErrorGraphExecUpdateFailure: MakeGraphExec(); break; @@ -207,7 +207,7 @@ class CudaGraphsSubSegExec { break; default: // Respond normally to unusual cudaGraphExecUpdate() ret vals - CUDA_CALL(e); + CUDA_CALL(err); } } From c44cfc64158a24faaa6f4de232085fda00086187 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 17 Feb 2022 19:39:17 -0800 Subject: [PATCH 11/30] Add compile guard to Dropout's 
FIsCUDAGraphsCompatible def --- src/operator/nn/dropout.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 6b4bfc8aebd2..414b82edcc65 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -34,6 +34,7 @@ NNVM_REGISTER_OP(Dropout) // Dropout is a passthrough during inference for all impls if (!is_train) return true; +#if MXNET_USE_CUDNN_DROPOUT // cuDNN impl is compatible during training as well const DropoutParam& param = nnvm::get(attrs.parsed); @@ -41,7 +42,10 @@ NNVM_REGISTER_OP(Dropout) bool cudnn_off = param.cudnn_off && param.cudnn_off.value(); bool cudnn_available = pkeep > 0 && !cudnn_off; - return MXNET_USE_CUDNN_DROPOUT && cudnn_available; + return cudnn_available; +#else + return false; +#endif // MXNET_USE_CUDNN_DROPOUT }) .set_attr("FStatefulCompute", DropoutCompute); From 5a2f847558a7f55790f1ad1fb5ee930b4ad1a3a9 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 18 Feb 2022 00:37:34 -0800 Subject: [PATCH 12/30] Temporarily add '-s' to pytest serial tests --- ci/docker/runtime_functions.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 05f80032cd15..69453c39e538 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -801,7 +801,8 @@ cd_unittest_ubuntu() { local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"} OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest - pytest -m 'serial' --durations=50 --verbose tests/python/unittest + # Temporarily tell pytest to not capture output ('-s') to get more insight into Python: Aborted error + pytest -m 'serial' --durations=50 --verbose -s --log-cli-level=DEBUG tests/python/unittest # https://github.com/apache/incubator-mxnet/issues/11801 # if [[ ${mxnet_variant} = "cpu" ]] || [[ ${mxnet_variant} = 
"mkl" ]]; then From 3b58b49e3d95b815f0ea6b3dd7849101630ac04e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sat, 26 Feb 2022 23:12:55 -0800 Subject: [PATCH 13/30] Fix DropoutOp.dropout_passthrough_ handling for CUDA Graphs --- src/operator/nn/dropout-inl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 18f94cffd25b..0baa8e40c397 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -437,7 +437,6 @@ class DropoutOp { using namespace mshadow::expr; Stream* s = ctx.get_stream(); if (!this->dropout_passthrough_) { - this->dropout_passthrough_ = true; const TBlob& gdata = in_grad[dropout::kData]; const TBlob& grad = out_grad[dropout::kOut]; const TBlob& mask = out_data[dropout::kMask]; From 0d620833d02a41fb218cd2526de903db48fa2877 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 27 Feb 2022 14:34:27 -0800 Subject: [PATCH 14/30] Adapt test_gluon_gpu.py::test_cuda_graphs for gluon2.0 --- tests/python/gpu/test_gluon_gpu.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index af6805c9cd5c..20a7f26e8686 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -596,21 +596,22 @@ def test_cudnn_dropout_reproducibility(): assert_almost_equal(a.grad, b.grad) +@mx.util.use_np def test_cuda_graphs(): class GraphTester(gluon.HybridBlock): def __init__(self, function_to_test, **kwargs): super(GraphTester, self).__init__(**kwargs) self.f = function_to_test() - def hybrid_forward(self, F, *args): + def forward(self, *args): # We need to isolate the operation to be fully inside the graph # in order for graphs usage to be possible - copied_args = [F.identity(a) for a in args] + copied_args = [mx.np.copy(a) for a in args] outputs = self.f(*copied_args) if isinstance(outputs, (list, tuple)): - return [F.identity(o) for o in outputs] + 
return [mx.np.copy(o) for o in outputs] else: - return F.identity(outputs) + return mx.np.copy(outputs) class TestDesc: def __init__(self, name, f, num_inputs=1, input_dim=4): @@ -620,8 +621,8 @@ def __init__(self, name, f, num_inputs=1, input_dim=4): self.input_dim = input_dim def generate_inputs(self): - shape = tuple(np.random.randint(4, 11, size=self.input_dim)) - ret = [mx.random.uniform(shape=shape) for _ in range(self.num_inputs)] + shape = tuple(_np.random.randint(4, 11, size=self.input_dim)) + ret = [mx.np.random.uniform(size=shape) for _ in range(self.num_inputs)] for r in ret: r.attach_grad() return ret @@ -654,6 +655,7 @@ def generate_inputs(self): with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', 'MXNET_USE_FUSION': '0'}): + device = mx.gpu(0) for test_desc in tested_ops: print("Testing ", test_desc.name) inputs = test_desc.generate_inputs() @@ -665,8 +667,8 @@ def generate_inputs(self): netg = GraphTester(test_desc.f) # initialize parameters - net.initialize() - netg.initialize() + net.initialize(device=device) + netg.initialize(device=device) net(*inputs) @@ -680,7 +682,7 @@ def generate_inputs(self): for _ in range(N): assert_almost_equal(net(*inputs), netg(*inputsg)) - mx.nd.waitall() + mx.npx.waitall() print("Testing training mode") for _ in range(N): with random_seed(seed): @@ -701,4 +703,4 @@ def generate_inputs(self): assert_almost_equal(p1.data(), p2.data()) if p1.grad_req != 'null': assert_almost_equal(p1.grad(), p2.grad()) - mx.nd.waitall() + mx.npx.waitall() From 3591f509f2f28e7a6e3166a25bf65fcce17631e3 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 27 Feb 2022 15:27:18 -0800 Subject: [PATCH 15/30] Create CUDA Graph 'dot' files if MXNET_CUDA_GRAPHS_DBG_FILE= --- docs/static_site/src/pages/api/faq/env_var.md | 6 ++ src/imperative/cuda_graphs.h | 59 ++++++++++++++----- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md index 
1a4421d2e50f..8e12b48aac76 100644 --- a/docs/static_site/src/pages/api/faq/env_var.md +++ b/docs/static_site/src/pages/api/faq/env_var.md @@ -180,6 +180,12 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 * MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES - Values: Int ```(default=0)``` - The maximum number of log messages generated by CUDA graphs executor. +* MXNET_CUDA_GRAPHS_DBG_FILE + - Values: String ```(default='', to indicate no debug dot files should be created)``` + - The file prefix for '.dot' files for each graph created. Full path is -devN-{trn,inf}..dot . +* MXNET_CUDA_GRAPHS_DBG_FILE_FLAGS + - Values: Int ```(default=)``` + - A bitmask to enable various types of info in the debug '.dot' files. See cudaGraphDebugDotFlags in the CUDA runtime API doc for details. ## Control the Data Communication diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 8a8a06fa6b6e..5c954011e4d2 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -30,6 +30,7 @@ #include #include #include +#include #include "./exec_pass.h" #include "../common/cuda/utils.h" @@ -179,10 +180,11 @@ class CudaGraphsSubSegExec { int from_op_idx, int num_ops, bool ops_are_cuda_graph_compatible = true) - : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr) { + : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr), + graph_exec_id_(0) { if (ops_are_cuda_graph_compatible) { MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); - MakeGraphExec(); + MakeGraphExec(exec_list, rctx); } } @@ -200,7 +202,7 @@ class CudaGraphsSubSegExec { cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), &error_node, &update_result); switch (err) { case cudaErrorGraphExecUpdateFailure: - MakeGraphExec(); + MakeGraphExec(exec_list, rctx); break; case cudaSuccess: CHECK_EQ(update_result, cudaGraphExecUpdateSuccess); @@ -229,6 +231,12 @@ class CudaGraphsSubSegExec { return graph_exec_ != nullptr; } + int 
NumGraphNodes() { + size_t numNodes; + CUDA_CALL(cudaGraphGetNodes(graph_.get(), static_cast(nullptr), &numNodes)); + return numNodes; + } + private: void MakeGraph(const std::vector >& exec_list, const RunContext& rctx, @@ -259,7 +267,9 @@ class CudaGraphsSubSegExec { } } - void MakeGraphExec() { + void MakeGraphExec(const std::vector>& exec_list, + const RunContext& rctx) { + // Note that this routine is not invoked when a graph executor is merely updated. cudaGraphExec_t cuda_graph_exec; cudaGraphNode_t error_node; char log_buffer[1000]; @@ -268,14 +278,34 @@ class CudaGraphsSubSegExec { graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); // At this point we have a CUDA Graph executor - static int num_graph_creations_logged = 0; - static int max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); - if (num_graph_creations_logged < max_log_entries) { - num_graph_creations_logged++; - LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; - if (num_graph_creations_logged == max_log_entries) + static int num_graph_creations = 0; + graph_exec_id_ = num_graph_creations++; + + static size_t max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); + if (graph_exec_id_ < max_log_entries) { + LOG(INFO) << "Created CUDA graph " << graph_exec_id_; + if (num_graph_creations == max_log_entries) LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; } + // Create a .dot file for graph visualization if requested + static std::string dotfile_base = dmlc::GetEnv("MXNET_CUDA_GRAPHS_DBG_FILE", std::string()); + if (dotfile_base.size() > 0) { +#if CUDA_VERSION >= 11030 + static int dotfile_flags = dmlc::GetEnv("MXNET_CUDA_GRAPHS_DBG_FILE_FLAGS", + static_cast(cudaGraphDebugDotFlagsVerbose)); + std::ostringstream filename; + const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; + int dev_id = rctx.ctx.dev_id; + filename << dotfile_base << "-" << "dev" << dev_id << "-" << (is_train ? 
"trn" : "inf") + << "-" << graph_exec_id_ << ".dot"; + CUDA_CALL(cudaGraphDebugDotPrint(graph_.get(), filename.str().c_str(), dotfile_flags)); +#else + static bool dot_file_unsupported = []() { + LOG(INFO) << "MXNET_CUDA_GRAPHS_DBG_FILE setting ignored- requires CUDA version >= 11.3"; + return true; + }(); +#endif // CUDA_VERSION >= 11030 + } } int from_op_idx_; @@ -284,6 +314,7 @@ class CudaGraphsSubSegExec { using cudaGraphExecStruct_t = typename std::remove_pointer::type; std::shared_ptr graph_; std::shared_ptr graph_exec_; + size_t graph_exec_id_; }; // The CudaGraph executor and associated Tempspace ptrs for which it is valid. @@ -404,13 +435,13 @@ class CudaGraphsExec { int runnable_execs = 0; int bypassed_ops = 0; for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { - if (subseg_exec.IsRunnable()) + if (subseg_exec.IsRunnable()) { + LOG(INFO) << "Launching captured graph with " << subseg_exec.NumGraphNodes() << " nodes."; runnable_execs++; - else + } else { bypassed_ops++; + } } - LOG(INFO) << "Launching " << runnable_execs << " captured CUDA graph(s) for op segment " - << opr_names_; if (bypassed_ops > 0) LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; } From 0e105ec6d189abe9b9010cb78cf5d22fa827d9b9 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 27 Feb 2022 15:36:38 -0800 Subject: [PATCH 16/30] Fix clang-tidy --- src/imperative/cuda_graphs.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 5c954011e4d2..25fd003b7eaf 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -173,14 +173,17 @@ class CudaGraphExecDeleter { // characterized by a starting index in the OpExecutor list and a number of ops. 
class CudaGraphsSubSegExec { public: - CudaGraphsSubSegExec(const std::vector >& exec_list, + CudaGraphsSubSegExec(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu, bool verbose, int from_op_idx, int num_ops, bool ops_are_cuda_graph_compatible = true) - : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr), + : from_op_idx_(from_op_idx), + num_ops_(num_ops), + graph_(nullptr), + graph_exec_(nullptr), graph_exec_id_(0) { if (ops_are_cuda_graph_compatible) { MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); @@ -188,7 +191,7 @@ class CudaGraphsSubSegExec { } } - void Update(const std::vector >& exec_list, + void Update(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu, bool verbose) { @@ -213,7 +216,7 @@ class CudaGraphsSubSegExec { } } - void RunSubSeg(const std::vector >& exec_list, + void RunSubSeg(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu) { if (IsRunnable()) { @@ -238,7 +241,7 @@ class CudaGraphsSubSegExec { } private: - void MakeGraph(const std::vector >& exec_list, + void MakeGraph(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu, bool verbose, @@ -279,7 +282,7 @@ class CudaGraphsSubSegExec { // At this point we have a CUDA Graph executor static int num_graph_creations = 0; - graph_exec_id_ = num_graph_creations++; + graph_exec_id_ = num_graph_creations++; static size_t max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); if (graph_exec_id_ < max_log_entries) { @@ -295,9 +298,10 @@ class CudaGraphsSubSegExec { static_cast(cudaGraphDebugDotFlagsVerbose)); std::ostringstream filename; const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; - int dev_id = rctx.ctx.dev_id; - filename << dotfile_base << "-" << "dev" << dev_id << "-" << (is_train ? 
"trn" : "inf") - << "-" << graph_exec_id_ << ".dot"; + int dev_id = rctx.ctx.dev_id; + filename << dotfile_base << "-" + << "dev" << dev_id << "-" << (is_train ? "trn" : "inf") << "-" << graph_exec_id_ + << ".dot"; CUDA_CALL(cudaGraphDebugDotPrint(graph_.get(), filename.str().c_str(), dotfile_flags)); #else static bool dot_file_unsupported = []() { @@ -338,7 +342,7 @@ using CudaGraphCache = std::map; class CudaGraphsExec { public: - CudaGraphsExec(const std::vector >& exec_list, + CudaGraphsExec(const std::vector>& exec_list, bool is_gpu, const char* opr_names) : verbose_(false), is_enabled_(false) { @@ -350,7 +354,7 @@ class CudaGraphsExec { } } - void RunAll(const std::vector >& exec_list, + void RunAll(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu) { // If this a CPU op or CUDA Graphs use isn't possible, run normally and return @@ -546,7 +550,7 @@ class CudaGraphsExec { } // Determine Tempspaces used by ops. Other resource uses disable CUDA Graphs. - void SetTempSpaces(const std::vector >& exec_list) { + void SetTempSpaces(const std::vector>& exec_list) { // Gather info about the ops use of TempSpace. if (is_enabled_) { std::set tempspaces_set; From d8d65c9ff094bd8a01e85ec36e98d63a1c69f8f0 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 27 Feb 2022 15:40:19 -0800 Subject: [PATCH 17/30] Fix more clang-tidy --- src/imperative/cuda_graphs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 25fd003b7eaf..f7c1663c4efa 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -457,7 +457,7 @@ class CudaGraphsExec { // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). If such a graph // is not runnable, e.g. if it includes memcpys from unpinned cpu memory, then make a // number of smaller graphs that avoid those ops with the memcpys. 
- void CreateSubExecOverRegion(const std::vector >& exec_list, + void CreateSubExecOverRegion(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu, size_t from_op_idx, From 26182fb501eb5bb40df2c76ff53aab8ab55d3e0e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 16 Feb 2022 22:40:35 -0800 Subject: [PATCH 18/30] Skip test_np_standard_binary_funcs test of 0-dim array broadcast --- tests/python/unittest/test_numpy_op.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 8008c053cd2b..806fb130ab27 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -11712,8 +11712,12 @@ def array_values(low, high, shape): ((3, 1), (3, 0)), ((0, 2), (1, 2)), ((2, 3, 4), (3, 1)), - ((2, 3), ()), - ((), (2, 3)) +# MXNet numpy does not match original numpy behavior when broadcasting 0-dim arrays. +# See https://github.com/apache/incubator-mxnet/issues/20898. 
+# ((2, 3), ()), +# ((), (2, 3)) + ((2, 3), (1,)), + ((1,), (2, 3)) ]) def test_np_standard_binary_funcs(func, func2, promoted, dtypes, ref_grad_a, ref_grad_b, low, high, lshape, rshape): class TestStandardBinary(HybridBlock): From 6cc8ab8d37b3eb1c1849e4b625f2a93c5bb2756e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 20 Feb 2022 20:44:57 -0800 Subject: [PATCH 19/30] Improve test_rnn_layers_fp{16,32} invocation --- tests/python/unittest/test_gluon_rnn.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 2911f9165244..7f86034fe0b8 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -606,7 +606,8 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, device=m @mx.util.use_np -def run_rnn_layers(dtype, dtype2, device=mx.cpu()): +def run_rnn_layers(dtype, dtype2): + device = default_device() check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), device=device) check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), device=device) @@ -673,10 +674,12 @@ def test_rnn_layers_fp32(): run_rnn_layers('float32', 'float32') @assert_raises_cudnn_not_satisfied(min_version='5.1.10') -@pytest.mark.skipif(mx.device.num_gpus() == 0, reason="RNN FP16 only implemented for GPU for now") @pytest.mark.serial def test_rnn_layers_fp16(): - run_rnn_layers('float16', 'float32', mx.gpu()) + # Dynamic skip condition is best handled this way, rather than with pytest.mark.skipIf + if default_device().device_type == 'cpu': + pytest.skip('RNN FP16 only implemented for GPU for now') + run_rnn_layers('float16', 'float32') def check_rnn_consistency(fused_layer, stack_layer, loss, mode, num_layers, input_size, hidden_size, bidirectional=False, rtol=1e-2, atol=1e-4): From 
d06b1395a4ab37db2f9aef7155c232def223a5f6 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 21 Feb 2022 10:42:28 -0800 Subject: [PATCH 20/30] Run test_rnn_layers_fp32 only when cuDNN is present --- tests/python/unittest/test_gluon_rnn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 7f86034fe0b8..c65a82ca539b 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -669,6 +669,7 @@ def run_rnn_layers(dtype, dtype2): out.backward() out = out.asnumpy() +@assert_raises_cudnn_not_satisfied(min_version='5.1.10') @pytest.mark.serial def test_rnn_layers_fp32(): run_rnn_layers('float32', 'float32') From c5198c2914e686018a8b1ee0848f0dc3772e3133 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 21 Feb 2022 16:58:32 -0800 Subject: [PATCH 21/30] Fix potential out-of-bounds write in count_sketch.cu --- src/operator/contrib/count_sketch.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/operator/contrib/count_sketch.cu b/src/operator/contrib/count_sketch.cu index 24ca7970e064..bb16695caa74 100644 --- a/src/operator/contrib/count_sketch.cu +++ b/src/operator/contrib/count_sketch.cu @@ -93,6 +93,9 @@ __global__ void sketch_backward_kernel(const int nthreads, // only calculate gradient regarding x // can also calculate gradient regarding s if needed const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= nthreads) { + return; + } const int i_indim = index % in_dim; const int i_sample = index / in_dim; const int i_outdim = i_sample * out_dim + h[i_indim]; From e013a85ea599fa761cb98762f11feab6e7d74049 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 1 Mar 2022 08:56:49 -0800 Subject: [PATCH 22/30] Add temp output to debug centos crash --- tests/python/gpu/test_gluon_gpu.py | 95 +++++++++++++++--------------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/tests/python/gpu/test_gluon_gpu.py 
b/tests/python/gpu/test_gluon_gpu.py index 20a7f26e8686..46e937377bc6 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -597,7 +597,8 @@ def test_cudnn_dropout_reproducibility(): assert_almost_equal(a.grad, b.grad) @mx.util.use_np -def test_cuda_graphs(): +@pytest.mark.serial +def test_cuda_graphs(capsys): class GraphTester(gluon.HybridBlock): def __init__(self, function_to_test, **kwargs): super(GraphTester, self).__init__(**kwargs) @@ -653,54 +654,56 @@ def generate_inputs(self): N = 10 - with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', - 'MXNET_USE_FUSION': '0'}): - device = mx.gpu(0) - for test_desc in tested_ops: - print("Testing ", test_desc.name) - inputs = test_desc.generate_inputs() - inputsg = [i.copy() for i in inputs] - for i in inputsg: - i.attach_grad() - seed = random.randint(0, 10000) - net = GraphTester(test_desc.f) - netg = GraphTester(test_desc.f) - - # initialize parameters - net.initialize(device=device) - netg.initialize(device=device) - - net(*inputs) + with capsys.disabled(): + with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', + 'MXNET_CUDA_GRAPHS_VERBOSE': '1', + 'MXNET_USE_FUSION': '0'}): + device = mx.gpu(0) + for test_desc in tested_ops: + sys.stdout.write('Testing {}\n'.format( test_desc.name)) + inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) + + # initialize parameters + net.initialize(device=device) + netg.initialize(device=device) + + net(*inputs) - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - p2.set_data(p1.data()) - - netg.hybridize(static_alloc=True, static_shape=True) - - print("Testing inference mode") - with random_seed(seed): - for _ in range(N): - assert_almost_equal(net(*inputs), netg(*inputsg)) + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): 
+ p2.set_data(p1.data()) - mx.npx.waitall() - print("Testing training mode") - for _ in range(N): - with random_seed(seed): - with mx.autograd.record(): - out = net(*inputs) - out.backward() + netg.hybridize(static_alloc=True, static_shape=True) + print(" Testing inference mode") with random_seed(seed): - with mx.autograd.record(): - outg = netg(*inputsg) - outg.backward() + for _ in range(N): + assert_almost_equal(net(*inputs), netg(*inputsg)) - assert_almost_equal(out, outg) - for i, ig in zip(inputs, inputsg): - assert_almost_equal(i.grad, ig.grad) - - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - assert_almost_equal(p1.data(), p2.data()) - if p1.grad_req != 'null': - assert_almost_equal(p1.grad(), p2.grad()) + mx.npx.waitall() + print(" Testing training mode") + for _ in range(N): + with random_seed(seed): + with mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) mx.npx.waitall() From 7651c97310481d485b9e320b1bd2811d14b2692c Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 3 Mar 2022 22:10:28 -0800 Subject: [PATCH 23/30] Mark InstanceNorm and LeakyRELU as not CUDA Graphs compatible --- src/operator/instance_norm.cu | 7 ++++++- src/operator/leaky_relu.cu | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/operator/instance_norm.cu b/src/operator/instance_norm.cu index ca45dbbff386..ce11fbf3200d 100644 --- a/src/operator/instance_norm.cu +++ b/src/operator/instance_norm.cu @@ -28,9 +28,14 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(InstanceNorm).set_attr("FCompute", 
InstanceNormForward); +NNVM_REGISTER_OP(InstanceNorm) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { return false; }) + .set_attr("FCompute", InstanceNormForward); NNVM_REGISTER_OP(_backward_instance_norm) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", InstanceNormBackward); } // namespace op diff --git a/src/operator/leaky_relu.cu b/src/operator/leaky_relu.cu index d461949ed225..82ec59bfe907 100644 --- a/src/operator/leaky_relu.cu +++ b/src/operator/leaky_relu.cu @@ -28,9 +28,14 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(LeakyReLU).set_attr("FCompute", LeakyReLUCompute); +NNVM_REGISTER_OP(LeakyReLU) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { return false; }) + .set_attr("FCompute", LeakyReLUCompute); NNVM_REGISTER_OP(_backward_LeakyReLU) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", LeakyReLUGradCompute); } // namespace op From e70402220237a5b82c74c0a0519a94d9b6cf3921 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 3 Mar 2022 22:13:33 -0800 Subject: [PATCH 24/30] Ops calling FStatefulCompute* are not CUDA Graphs compatible by default --- src/imperative/cuda_graphs.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index f7c1663c4efa..d087acc6888f 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -510,16 +510,21 @@ class CudaGraphsExec { // Is the Op OK to make part of a CUDA Graph? 
bool OpOK(const std::shared_ptr& exec) { - static auto& fstateful = Op::GetAttr("FCreateOpState"); - static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); - static auto& fcompute_ex = Op::GetAttr("FComputeEx"); - const auto& attrs = exec->attrs; + static auto& fgraphcompatible = + Op::GetAttr("FIsCUDAGraphsCompatible"); + static auto& fstateful = Op::GetAttr("FCreateOpState"); + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); + static auto& fstatefulcompute = Op::GetAttr("FStatefulCompute"); + static auto& fstatefulcompute_ex = Op::GetAttr("FStatefulComputeEx"); + const auto& attrs = exec->attrs; if (attrs.op != nullptr) { const auto f = fgraphcompatible.get(attrs.op, nullptr); if (f != nullptr) { return f(attrs, exec->op_ctx.is_train); } - if (fstateful.get(attrs.op, nullptr) != nullptr) { + if (fstateful.get(attrs.op, nullptr) != nullptr || + fstatefulcompute.get(attrs.op, nullptr) != nullptr || + fstatefulcompute_ex.get(attrs.op, nullptr) != nullptr) { if (verbose_) { LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; } From da59cff1df9095809940560d66bf4c33593bdfc8 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 3 Mar 2022 23:34:56 -0800 Subject: [PATCH 25/30] Fix clang-tidy --- src/imperative/cuda_graphs.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index d087acc6888f..ee2ba00aea84 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -510,11 +510,10 @@ class CudaGraphsExec { // Is the Op OK to make part of a CUDA Graph? 
bool OpOK(const std::shared_ptr& exec) { - static auto& fgraphcompatible = - Op::GetAttr("FIsCUDAGraphsCompatible"); - static auto& fstateful = Op::GetAttr("FCreateOpState"); - static auto& fcompute_ex = Op::GetAttr("FComputeEx"); - static auto& fstatefulcompute = Op::GetAttr("FStatefulCompute"); + static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); + static auto& fstateful = Op::GetAttr("FCreateOpState"); + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); + static auto& fstatefulcompute = Op::GetAttr("FStatefulCompute"); static auto& fstatefulcompute_ex = Op::GetAttr("FStatefulComputeEx"); const auto& attrs = exec->attrs; if (attrs.op != nullptr) { From 45bb7b8403bc05b6f40603e78ce0c38f181e25d9 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 9 Mar 2022 16:46:45 -0800 Subject: [PATCH 26/30] Revert "Add temp output to debug centos crash" This reverts commit e013a85ea599fa761cb98762f11feab6e7d74049. --- tests/python/gpu/test_gluon_gpu.py | 95 +++++++++++++++--------------- 1 file changed, 46 insertions(+), 49 deletions(-) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 46e937377bc6..20a7f26e8686 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -597,8 +597,7 @@ def test_cudnn_dropout_reproducibility(): assert_almost_equal(a.grad, b.grad) @mx.util.use_np -@pytest.mark.serial -def test_cuda_graphs(capsys): +def test_cuda_graphs(): class GraphTester(gluon.HybridBlock): def __init__(self, function_to_test, **kwargs): super(GraphTester, self).__init__(**kwargs) @@ -654,56 +653,54 @@ def generate_inputs(self): N = 10 - with capsys.disabled(): - with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', - 'MXNET_CUDA_GRAPHS_VERBOSE': '1', - 'MXNET_USE_FUSION': '0'}): - device = mx.gpu(0) - for test_desc in tested_ops: - sys.stdout.write('Testing {}\n'.format( test_desc.name)) - inputs = test_desc.generate_inputs() - inputsg = [i.copy() for i in inputs] - for i in 
inputsg: - i.attach_grad() - seed = random.randint(0, 10000) - net = GraphTester(test_desc.f) - netg = GraphTester(test_desc.f) - - # initialize parameters - net.initialize(device=device) - netg.initialize(device=device) - - net(*inputs) + with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', + 'MXNET_USE_FUSION': '0'}): + device = mx.gpu(0) + for test_desc in tested_ops: + print("Testing ", test_desc.name) + inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - p2.set_data(p1.data()) + # initialize parameters + net.initialize(device=device) + netg.initialize(device=device) - netg.hybridize(static_alloc=True, static_shape=True) + net(*inputs) - print(" Testing inference mode") - with random_seed(seed): - for _ in range(N): - assert_almost_equal(net(*inputs), netg(*inputsg)) + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + p2.set_data(p1.data()) - mx.npx.waitall() - print(" Testing training mode") + netg.hybridize(static_alloc=True, static_shape=True) + + print("Testing inference mode") + with random_seed(seed): for _ in range(N): - with random_seed(seed): - with mx.autograd.record(): - out = net(*inputs) - out.backward() - - with random_seed(seed): - with mx.autograd.record(): - outg = netg(*inputsg) - outg.backward() - - assert_almost_equal(out, outg) - for i, ig in zip(inputs, inputsg): - assert_almost_equal(i.grad, ig.grad) - - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - assert_almost_equal(p1.data(), p2.data()) - if p1.grad_req != 'null': - assert_almost_equal(p1.grad(), p2.grad()) + assert_almost_equal(net(*inputs), netg(*inputsg)) + + mx.npx.waitall() + print("Testing training mode") + for _ in range(N): + with random_seed(seed): + with 
mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) mx.npx.waitall() From b7ecce28254fa74692fb4269752d86886d0b33c5 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 9 Mar 2022 16:58:20 -0800 Subject: [PATCH 27/30] Quiet 'unused variable' compilation warning --- src/imperative/cuda_graphs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index ee2ba00aea84..06fe50ef4a5f 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -304,7 +304,7 @@ class CudaGraphsSubSegExec { << ".dot"; CUDA_CALL(cudaGraphDebugDotPrint(graph_.get(), filename.str().c_str(), dotfile_flags)); #else - static bool dot_file_unsupported = []() { + [[maybe_unused]] static bool dot_file_unsupported = []() { // NOLINT LOG(INFO) << "MXNET_CUDA_GRAPHS_DBG_FILE setting ignored- requires CUDA version >= 11.3"; return true; }(); From c609ccea31036ac1f4fe4c830232f412c97e8eff Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 9 Mar 2022 21:02:03 -0800 Subject: [PATCH 28/30] Trigger CI From eaf61a02d3340a78cff36ba308ff5c6934ba8435 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 15 Mar 2022 13:45:48 -0700 Subject: [PATCH 29/30] Check of FCreateOpState removed given new check for FStatefulCompute* --- src/imperative/cuda_graphs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 06fe50ef4a5f..c9e16d84e8b3 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -511,7 +511,6 @@ 
class CudaGraphsExec { // Is the Op OK to make part of a CUDA Graph? bool OpOK(const std::shared_ptr& exec) { static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); - static auto& fstateful = Op::GetAttr("FCreateOpState"); static auto& fcompute_ex = Op::GetAttr("FComputeEx"); static auto& fstatefulcompute = Op::GetAttr("FStatefulCompute"); static auto& fstatefulcompute_ex = Op::GetAttr("FStatefulComputeEx"); @@ -521,8 +520,7 @@ class CudaGraphsExec { if (f != nullptr) { return f(attrs, exec->op_ctx.is_train); } - if (fstateful.get(attrs.op, nullptr) != nullptr || - fstatefulcompute.get(attrs.op, nullptr) != nullptr || + if (fstatefulcompute.get(attrs.op, nullptr) != nullptr || fstatefulcompute_ex.get(attrs.op, nullptr) != nullptr) { if (verbose_) { LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; From f4510273fed48bc7b94cbcb0cc9c9dfe4aec96d3 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 15 Mar 2022 16:55:10 -0700 Subject: [PATCH 30/30] Revert "Temporarily add '-s' to pytest serial tests" This reverts commit 5a2f847558a7f55790f1ad1fb5ee930b4ad1a3a9. 
--- ci/docker/runtime_functions.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 69453c39e538..05f80032cd15 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -801,8 +801,7 @@ cd_unittest_ubuntu() { local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"} OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest - # Temporarily tell pytest to not capture output ('-s') to get more insight into Python: Aborted error - pytest -m 'serial' --durations=50 --verbose -s --log-cli-level=DEBUG tests/python/unittest + pytest -m 'serial' --durations=50 --verbose tests/python/unittest # https://github.com/apache/incubator-mxnet/issues/11801 # if [[ ${mxnet_variant} = "cpu" ]] || [[ ${mxnet_variant} = "mkl" ]]; then