From f69b4a055447a5be30385d69acfa1403961e00eb Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Fri, 18 Sep 2020 20:18:50 -0700 Subject: [PATCH 01/30] [1.x][FEATURE] CUDA graphs support (#19142) * Initial cherry-pick * Store NodeAttrs in OpExecutor * Do not allow stateful operations in CUDA graphs and provide mechanism for marking ops as safe * Guard against using ops with synchronization * Cleaning * Properly guard graphs * Limit graphs to CUDA 10.2+ * Fix the compilation when graphs are not available * Guarding the libcuda.so usage behind RTC compilation flag * Document the env variables * Add test * Fix the test * Use with_environment --- docs/static_site/src/pages/api/faq/env_var.md | 10 + include/mxnet/op_attr_types.h | 13 + src/imperative/attach_op_execs_pass.cc | 57 +- src/imperative/cuda_graphs.h | 547 ++++++++++++++++++ src/imperative/exec_pass.h | 18 + src/imperative/imperative_utils.h | 21 +- src/operator/contrib/adamw.cu | 17 + src/operator/nn/dropout.cu | 8 +- src/operator/numpy/linalg/np_eig.cu | 14 +- src/operator/numpy/linalg/np_eigvals.cu | 14 +- src/operator/numpy/linalg/np_norm_backward.cu | 5 + src/operator/numpy/linalg/np_norm_forward.cu | 8 +- src/operator/numpy/np_boolean_mask_assign.cu | 8 + src/operator/numpy/np_constraint_check.cu | 4 + src/operator/numpy/np_percentile_op.cu | 7 +- src/operator/numpy/random/np_bernoulli_op.cu | 7 +- .../numpy/random/np_exponential_op.cu | 8 + src/operator/numpy/random/np_gamma_op.cu | 7 +- .../numpy/random/np_multinomial_op.cu | 4 + src/operator/numpy/random/np_normal_op.cu | 14 +- src/operator/numpy/random/np_pareto_op.cu | 7 +- src/operator/numpy/random/np_power_op.cu | 7 +- src/operator/numpy/random/np_rayleigh_op.cu | 7 +- src/operator/numpy/random/np_weibull_op.cu | 7 +- src/operator/tensor/indexing_op.cu | 7 +- tests/python/gpu/test_gluon_gpu.py | 110 +++- 26 files changed, 892 insertions(+), 44 deletions(-) create mode 100644 src/imperative/cuda_graphs.h diff --git 
a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md index dad481cfbf3f..1a4421d2e50f 100644 --- a/docs/static_site/src/pages/api/faq/env_var.md +++ b/docs/static_site/src/pages/api/faq/env_var.md @@ -170,6 +170,16 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 * MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD - Values: Int ```(default=)``` - The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass. +* MXNET_ENABLE_CUDA_GRAPHS + - Values: 0(false) or 1(true) ```(default=0)``` + - If set to `1`, MXNet will utilize CUDA graphs when executing models on the GPU when possible. + - For CUDA graphs execution, one needs to use either symbolic model or Gluon model hybridized with options `static_alloc` and `static_shape` set to True. +* MXNET_CUDA_GRAPHS_VERBOSE + - Values: 0(false) or 1(true) ```(default=0)``` + - If set to `1`, CUDA graphs executor will provide information about the graph being captured and executed. +* MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES + - Values: Int ```(default=0)``` + - The maximum number of log messages generated by CUDA graphs executor. ## Control the Data Communication diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index 2fec1768ea86..73504bb2748b 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -357,6 +357,19 @@ using FNeedCalibrateInput = std::function(const NodeAttrs& attr */ using FNeedCalibrateOutput = std::function(const NodeAttrs& attrs)>; +#if MXNET_USE_CUDA + +/*! + * \brief Register a function to determine if + * the operator implementation is compatible + * with CUDA graphs. This requires the execution + * to stay the same as long as the shape and type + * of input stays the same. 
+ */ +using FIsCUDAGraphsCompatible = std::function; + +#endif + } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/src/imperative/attach_op_execs_pass.cc b/src/imperative/attach_op_execs_pass.cc index 4a8c51d107c7..719c9b3165fe 100644 --- a/src/imperative/attach_op_execs_pass.cc +++ b/src/imperative/attach_op_execs_pass.cc @@ -47,8 +47,10 @@ namespace exec { // FComputeExecutor and FStatefulComputeExecutor inherit from this class class StorageFallbackOpExecutor : public OpExecutor { public: - explicit StorageFallbackOpExecutor(std::vector mutate_idx) - : mutate_idx_(std::move(mutate_idx)) {} + explicit StorageFallbackOpExecutor(const NodeAttrs& attrs, + DispatchMode dispatch_mode, + std::vector mutate_idx) + : OpExecutor(attrs, dispatch_mode), mutate_idx_(std::move(mutate_idx)) {} void Setup() override { init_ = false; @@ -146,11 +148,13 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { return state_; } - explicit StatefulComputeExecutor(OpStatePtr state, + explicit StatefulComputeExecutor(const NodeAttrs& attrs, + DispatchMode dispatch_mode, + OpStatePtr state, FStatefulCompute fcompute, ExecType exec_type, const std::vector& mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), state_(std::move(state)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} @@ -168,7 +172,7 @@ class StatefulComputeExExecutor : public OpExecutor { op_ctx.run_ctx = rctx; INVALIDATE_OUTPUTS(out_array, req); std::vector* pInArray = &in_array; - CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs_); + CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs); fcompute_(state_, op_ctx, *pInArray, req, out_array); } @@ -186,17 +190,17 @@ class StatefulComputeExExecutor : public OpExecutor { return state_; } - explicit StatefulComputeExExecutor(NodeAttrs attrs, + explicit StatefulComputeExExecutor(const NodeAttrs& attrs, + DispatchMode 
dispatch_mode, OpStatePtr state, FStatefulComputeEx fcompute, ExecType exec_type) - : attrs_(std::move(attrs)), + : OpExecutor(attrs, dispatch_mode), state_(std::move(state)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} private: - NodeAttrs attrs_; OpStatePtr state_; FStatefulComputeEx fcompute_; ExecType exec_type_; @@ -210,7 +214,7 @@ class FComputeExecutor : public StorageFallbackOpExecutor { op_ctx.run_ctx = rctx; INVALIDATE_OUTPUTS(out_array, req); PreFCompute(is_gpu); - fcompute_(attrs_, op_ctx, in_data_, req, out_data_); + fcompute_(attrs, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); } @@ -218,17 +222,16 @@ class FComputeExecutor : public StorageFallbackOpExecutor { return exec_type_; } - explicit FComputeExecutor(NodeAttrs attrs, + explicit FComputeExecutor(const NodeAttrs& attrs, + DispatchMode dispatch_mode, FCompute fcompute, ExecType exec_type, const std::vector& mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), - attrs_(std::move(attrs)), + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} private: - NodeAttrs attrs_; FCompute fcompute_; ExecType exec_type_; }; @@ -240,8 +243,8 @@ class FComputeExExecutor : public OpExecutor { op_ctx.run_ctx = rctx; INVALIDATE_OUTPUTS(out_array, req); std::vector* pInArray = &in_array; - CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs_); - fcompute_(attrs_, op_ctx, *pInArray, req, out_array); + CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs); + fcompute_(attrs, op_ctx, *pInArray, req, out_array); } void Setup() override {} @@ -250,11 +253,12 @@ class FComputeExExecutor : public OpExecutor { return exec_type_; } - explicit FComputeExExecutor(NodeAttrs attrs, FComputeEx fcompute, ExecType exec_type) + explicit FComputeExExecutor(const NodeAttrs& attrs, DispatchMode dispatch_mode, + FComputeEx fcompute, ExecType exec_type) + : OpExecutor(attrs, dispatch_mode), 
fcompute_(std::move(fcompute)), exec_type_(exec_type) {} : attrs_(std::move(attrs)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} private: - NodeAttrs attrs_; FComputeEx fcompute_; ExecType exec_type_; }; @@ -309,14 +313,15 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared( - inode.source->attrs, state, fcompute_ex, exec_type); + inode.source->attrs, dispatch_modes[i], state, fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute(op, "FStatefulCompute", vctx[i]); CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared(state, fcompute, exec_type, mutate_index); + ret[i] = std::make_shared( + inode.source->attrs, dispatch_modes[i], state, fcompute, exec_type, mutate_index); } } else if (is_layer_backward.get(op, false)) { CHECK_GE(inode.control_deps.size(), 1); @@ -327,25 +332,27 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, common::GetFCompute(op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared( - inode.source->attrs, ret[fwd_id].get()->state(), fcompute_ex, exec_type); + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], ret[fwd_id].get()->state(), fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute(op, "FStatefulCompute", vctx[i]); CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared( - ret[fwd_id].get()->state(), 
fcompute, exec_type, mutate_index); + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], ret[fwd_id].get()->state(), fcompute, exec_type, + mutate_index); } } else { FCompute fcompute = common::GetFCompute(op, "FCompute", vctx[i]); FComputeEx fcomp_ex = common::GetFCompute(op, "FComputeEx", vctx[i]); if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(inode.source->attrs, fcomp_ex, exec_type); + ret[i] = std::make_shared( + inode.source->attrs, dispatch_modes[i], fcomp_ex, exec_type); } else if (fcompute != nullptr) { ret[i] = std::make_shared( - inode.source->attrs, fcompute, exec_type, mutate_index); + inode.source->attrs, dispatch_modes[i], fcompute, exec_type, mutate_index); } else { LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name; } diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h new file mode 100644 index 000000000000..e669d7d1d2e3 --- /dev/null +++ b/src/imperative/cuda_graphs.h @@ -0,0 +1,547 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2020 by Contributors + * \file cuda_graphs.h + * \brief Wrappers for use of CUDA Graphs API + */ +#ifndef MXNET_EXECUTOR_CUDA_GRAPHS_H_ +#define MXNET_EXECUTOR_CUDA_GRAPHS_H_ + +#include +#include +#include +#include +#include + +#include "./exec_pass.h" +#include "../common/cuda_utils.h" + +#if MXNET_USE_CUDA +#define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10020) +#else +#define CUDA_GRAPHS_AVAILABLE (0) +#endif + +#if CUDA_GRAPHS_AVAILABLE + +namespace mxnet { +namespace cuda_graphs { + +inline std::string CudaDim3ToString(const dim3& dims) { + std::stringstream ss; + if (dims.z != 1) + ss << "(" << dims.x << "," << dims.y << "," << dims.z << ")"; + else if (dims.y != 1) + ss << "(" << dims.x << "," << dims.y << ")"; + else + ss << "(" << dims.x << ")"; + return ss.str(); +} + +// Return the list of CUDA Graph nodes from a graph +inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { + size_t numNodes; + CUDA_CALL(cudaGraphGetNodes(cuda_graph, static_cast(nullptr), &numNodes)); + if (numNodes == 0) + return std::vector(); + std::vector graphNodes(numNodes); + CUDA_CALL(cudaGraphGetNodes(cuda_graph, graphNodes.data(), &numNodes)); + return graphNodes; +} + +// It does not really involve RTC, but requires libcuda.so, +// which is linked only when RTC is enabled. +#if MXNET_ENABLE_CUDA_RTC + +inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { + std::stringstream ss; + + // The following introspection calls are made through the driver API in order to bypass + // problems that would arise if multiple statically-linked copies of the runtime exist. 
+ + CUgraphNode cu_node = node; + CUgraphNodeType t; + CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); + switch (t) { + case CU_GRAPH_NODE_TYPE_KERNEL: + { + CUDA_KERNEL_NODE_PARAMS kparams; + auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); + if (err == CUDA_SUCCESS) { + ss << "GPUKernel@" << kparams.func; + dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); + dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); + ss << "<<>>"; + ss << "(..."; + if (kparams.sharedMemBytes != 0) + ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; + ss << ")"; + } else { + ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; + } + } + break; + case CU_GRAPH_NODE_TYPE_MEMCPY: + { + cudaMemcpy3DParms mparams = {}; + CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); + // If memcpy is seen, return without setting up runnable executor + switch (mparams.kind) { + case cudaMemcpyHostToHost: ss << "Host->Host "; break; + case cudaMemcpyHostToDevice: ss << "Host->Device "; break; + case cudaMemcpyDeviceToHost: ss << "Device->Host "; break; + case cudaMemcpyDeviceToDevice: ss << "Device->Device "; break; + default: break; + } + ss << "Memcpy"; + } + break; + case CU_GRAPH_NODE_TYPE_MEMSET: + { + cudaMemsetParams mparams = {}; + CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); + if (mparams.height == 1 && mparams.elementSize == 1) { + ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value + << ", count=" << mparams.width << ")"; + } else { + if (mparams.elementSize == 1) + ss << "cudaMemset2D"; + else + ss << "MemSet"; + ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch + << ", value=" << mparams.value << ", width=" << mparams.width + << ", height=" << mparams.height << ")"; + } + } + break; + case CU_GRAPH_NODE_TYPE_HOST: ss << "Host (executable) node"; break; + case CU_GRAPH_NODE_TYPE_GRAPH: ss << "Node which executes an embedded graph"; break; + case CU_GRAPH_NODE_TYPE_EMPTY: 
ss << "Empty (no-op) node"; break; + default: ss << "Unknown/Invalid node type " << t; + } + return ss.str(); +} + +#endif // MXNET_ENABLE_CUDA_RTC + +// CUDA Graphs are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphDeleter { + public: + void operator() (cudaGraph_t graph) { + if (graph != nullptr) + CUDA_CALL(cudaGraphDestroy(graph)); + } +}; + +// CUDA Graphs Executors are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphExecDeleter { + public: + void operator() (cudaGraphExec_t graph_exec) { + if (graph_exec != nullptr) + CUDA_CALL(cudaGraphExecDestroy(graph_exec)); + } +}; + +// A CUDA Graphs executor for a portion of an Operator Segment (i.e. a 'SubSegment'), +// characterized by a starting index in the OpExecutor list and a number of ops. +class CudaGraphsSubSegExec { + public: + CudaGraphsSubSegExec(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops, + bool ops_are_cuda_graph_compatible = true) : + from_op_idx_(from_op_idx), + num_ops_(num_ops), + graph_(nullptr), + graph_exec_(nullptr) { + if (ops_are_cuda_graph_compatible) { + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); + MakeGraphExec(); + } + } + + void Update(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose) { + // Current executor should be Runnable with the same parameters + CHECK(IsRunnable()); + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx_, num_ops_); + + cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; + cudaGraphNode_t error_node; + CUDA_CALL(cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), + &error_node, &update_result)); + // If update fails make a new executor, discarding old one. 
+ if (update_result != cudaGraphExecUpdateSuccess) + MakeGraphExec(); + } + + void RunSubSeg(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + if (IsRunnable()) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + CUDA_CALL(cudaGraphLaunch(graph_exec_.get(), cu_s)); + } else { + // No CUDA Graph could be made for this portion of the OpSegment. Run conventionally. + for (int i = 0; i != num_ops_; ++i) + exec_list[from_op_idx_ + i]->Run(rctx, is_gpu); + } + } + + bool IsRunnable() { return graph_exec_ != nullptr; } + + private: + void MakeGraph(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // Create CUDA Graph + // Use of cudaStreamCaptureModeThreadLocal allows other threads like GPU Copy workers + // to sync their streams without disturbing this capture. + CUDA_CALL(cudaStreamBeginCapture(cu_s, cudaStreamCaptureModeThreadLocal)); + // Run those oprs in the sub segment while capturing- no actual GPU work is launched. 
+ for (int i = 0; i != num_ops; ++i) + exec_list[from_op_idx + i]->Run(rctx, is_gpu); + cudaGraph_t cuda_graph = nullptr; + CUDA_CALL(cudaStreamEndCapture(cu_s, &cuda_graph)); + graph_.reset(cuda_graph, CudaGraphDeleter()); + + if (verbose) { + std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); + size_t num_nodes = graph_nodes.size(); + LOG(INFO) << " Graph has " << num_nodes << " nodes:"; +#if MXNET_ENABLE_CUDA_RTC + for (size_t i = 0; i != num_nodes; ++i) { + LOG(INFO) << " node " << i << " = " + << CudaGraphNodeToString(graph_nodes[i]); + } +#endif // MXNET_ENABLE_CUDA_RTC + } + } + + void MakeGraphExec() { + cudaGraphExec_t cuda_graph_exec; + cudaGraphNode_t error_node; + char log_buffer[1000]; + + CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), + &error_node, log_buffer, 1000)); + graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); + + // At this point we have a CUDA Graph executor + static int num_graph_creations_logged = 0; + static int max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); + if (num_graph_creations_logged < max_log_entries) { + num_graph_creations_logged++; + LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; + if (num_graph_creations_logged == max_log_entries) + LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; + } + } + + int from_op_idx_; + int num_ops_; + using cudaGraphStruct_t = typename std::remove_pointer::type; + using cudaGraphExecStruct_t = typename std::remove_pointer::type; + std::shared_ptr graph_; + std::shared_ptr graph_exec_; +}; + +// The CudaGraph executor and associated Tempspace ptrs for which it is valid. +struct CudaGraphInfo { + std::vector cuda_graph_subseg_execs; + bool has_been_run_conventionally = false; + std::vector tempspace_dptrs; +}; +// A CUDA graph is maintained for every combination of cudaStream_t (i.e. GPU Worker) and +// the state of the is_train flag of the OpContext. 
If the tempspace_dptrs change, we +// don't expect to ever see the old tempspace_dptrs config again, so we discard the CUDA graph. +struct CudaGraphCacheKey { + cudaStream_t cu_s; + bool is_train; + // overload '<' so CudaGraphCacheKey can be used as a std::map key + bool operator<(const CudaGraphCacheKey &other) const { + return cu_s < other.cu_s || (cu_s == other.cu_s && is_train < other.is_train); + } +}; +using CudaGraphCache = std::map; + +class CudaGraphsExec { + public: + CudaGraphsExec(const std::vector > &exec_list, + bool is_gpu, + const char *opr_names) : + verbose_(false), is_enabled_(false) { + opr_names_ = opr_names ? std::string(opr_names) : std::string(); + if (is_gpu) { + is_enabled_ = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false); + verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); + SetTempSpaces(exec_list); + } + } + + void RunAll(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + // If this a CPU op or CUDA Graphs use isn't possible, run normally and return + if (!is_gpu || !is_enabled_) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + return; + } + + // Also if we're in a warm-up period where tempspace pointers are likely + // to change, run normally and return + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // All the ops in the bulked segment will have the same setting of is_train as the first op + const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; + const CudaGraphCacheKey key = {cu_s, is_train}; + // Look-up the CUDA Graph info for this combo of stream and is_train setting + // This may create a default-initialized new entry. 
+ auto &cuda_graph_info = cache_[key]; + if (!cuda_graph_info.has_been_run_conventionally) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + cuda_graph_info.has_been_run_conventionally = true; + return; + } + + // At this point we will launch one or more CUDA Graphs through CUDA Graphs 'executors' + // (there might be more than one executor if some ops in the segment are not capturable) + auto before_exec_tempspace_ptrs = GetGPUTempspacePtrs(s); + + // Executors exist, but the tempspace pts have changed, so update them in-place via 'recapture'. + if (cuda_graph_info.cuda_graph_subseg_execs.size() > 0 && + cuda_graph_info.tempspace_dptrs != before_exec_tempspace_ptrs) { + // Update all runnable executors. Non-runnable executors launch their ops conventionally. + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + subseg_exec.Update(exec_list, rctx, is_gpu, verbose_); + } + } else if (cuda_graph_info.cuda_graph_subseg_execs.size() == 0) { + // No executors exist yet, so create them. + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op segment " << opr_names_; + // Make one or more CUDA Graphs, avoiding ops that are not compatible. + for (size_t first_op_idx = 0; first_op_idx != exec_list.size();) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != exec_list.size(); ++last_op_idx) { + if (OpOK(exec_list[last_op_idx])) + num_good_ops++; + else + break; + } + if (num_good_ops > 0) { + CreateSubExecOverRegion(exec_list, rctx, is_gpu, + first_op_idx, + first_op_idx + num_good_ops, + &cuda_graph_info.cuda_graph_subseg_execs); + first_op_idx += num_good_ops; + } + if (first_op_idx != exec_list.size()) { + // We had to have hit an op that was not OK. 
+ if (verbose_) { + LOG(INFO) << "Bypassing notOK op segment[" << first_op_idx << "," << first_op_idx << "]" + << " of op segment " << opr_names_; + } + CudaGraphsSubSegExec notOK_opseg(exec_list, rctx, is_gpu, false, first_op_idx, 1, false); + cuda_graph_info.cuda_graph_subseg_execs.push_back(notOK_opseg); + first_op_idx++; + } + } + // During graph capture, the ops may be asking for the tempworkspace. This should + // not alter the base pointers, since this op seg has been executed before on this + // stream (i.e. on this gpu worker). Safest to double-check this though. + auto after_capture_tempspace_ptrs = GetGPUTempspacePtrs(s); + if (before_exec_tempspace_ptrs != after_capture_tempspace_ptrs) + LOG(FATAL) << "Internal error: saw change in TempSpace ptrs during CUDA graph use."; + cuda_graph_info.tempspace_dptrs = before_exec_tempspace_ptrs; + } + // Now execute the CUDA Graph that we either just created or looked-up in the cache. + if (verbose_) { + int runnable_execs = 0; + int bypassed_ops = 0; + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + runnable_execs++; + else + bypassed_ops++; + } + LOG(INFO) << "Launching " << runnable_execs + << " captured CUDA graph(s) for op segment " << opr_names_; + if (bypassed_ops > 0) + LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; + } + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) + subseg_exec.RunSubSeg(exec_list, rctx, is_gpu); + } + + private: + // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). If such a graph + // is not runnable, e.g. if it includes memcpys from unpinned cpu memory, then make a + // number of smaller graphs that avoid those ops with the memcpys. 
+ void CreateSubExecOverRegion(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + size_t from_op_idx, + size_t upto_op_idx, + std::vector *cuda_graph_subseg_execs) { + // Optimistically try to create a CUDA Graph of the entire op segment region + + int num_ops = upto_op_idx - from_op_idx; + CudaGraphsSubSegExec full_opseg(exec_list, rctx, is_gpu, verbose_, from_op_idx, num_ops); + if (full_opseg.IsRunnable()) { + cuda_graph_subseg_execs->push_back(full_opseg); + } else { + if (verbose_) + LOG(INFO) << " Graph was not runnable- creating op sub-segments..."; + // Enter fall-back approach to making many sub-execs + for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx; ) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != upto_op_idx; ++last_op_idx) { + CudaGraphsSubSegExec single_opseg(exec_list, rctx, is_gpu, false, last_op_idx, 1); + if (single_opseg.IsRunnable()) + num_good_ops++; + // Is it time to create a subseg exec from accumulated good ops? 
+ if (num_good_ops > 0 && + (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op sub segment[" + << first_op_idx << ":" << (first_op_idx + num_good_ops - 1) << "]" + << " of op segment " << opr_names_; + CudaGraphsSubSegExec good_opseg(exec_list, rctx, is_gpu, verbose_, + first_op_idx, num_good_ops); + CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; + cuda_graph_subseg_execs->push_back(good_opseg); + first_op_idx += num_good_ops; + } + // If the last single op was not runnable, use the exec to handle that op conventionally + if (!single_opseg.IsRunnable()) { + if (verbose_) { + LOG(INFO) << "Bypassing op sub segment[" << last_op_idx << "," << last_op_idx << "]" + << " of op segment " << opr_names_; + // Generate throw-away exec in order to produce a diagnostic listing of graph nodes + CudaGraphsSubSegExec dummy(exec_list, rctx, is_gpu, verbose_, last_op_idx, 1); + } + cuda_graph_subseg_execs->push_back(single_opseg); + first_op_idx++; + break; + } + } + } + } + } + + // Is the Op OK to make part of a CUDA Graph? 
+ bool OpOK(const std::shared_ptr &exec) { + static auto& fstateful = Op::GetAttr("FCreateOpState"); + static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); + const auto& attrs = exec->attrs; + if (attrs.op != nullptr) { + const auto f = fgraphcompatible.get(attrs.op, nullptr); + if (f != nullptr) { + return f(attrs, exec->op_ctx.is_train); + } + if (fstateful.get(attrs.op, nullptr) != nullptr) { + if (verbose_) { + LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; + } + return false; + } + if ((fcompute_ex.get(attrs.op, nullptr) != nullptr && + exec->dispatch_mode == DispatchMode::kFComputeEx) || + exec->dispatch_mode == DispatchMode::kFComputeFallback) { + if (verbose_) { + LOG(INFO) << "Omitting operator " << attrs.op->name + << " from CUDA graph due to dispatch mode " + << static_cast(exec->dispatch_mode); + } + return false; + } + } + for (auto& resource : exec->op_ctx.requested) { + if (!(resource.req.type == ResourceRequest::kTempSpace)) { + if (verbose_) { + LOG(INFO) << "Omitting operator " << attrs.op->name + << " from CUDA graph due to using the resource type " + << static_cast(resource.req.type); + } + return false; + } + } + return true; + } + + // Determine Tempspaces used by ops. Other resource uses disable CUDA Graphs. + void SetTempSpaces(const std::vector > &exec_list) { + // Gather info about the ops use of TempSpace. 
+ if (is_enabled_) { + std::set tempspaces_set; + for (auto& exec : exec_list) { + for (auto& resource : exec->op_ctx.requested) { + if (resource.req.type == ResourceRequest::kTempSpace) { + tempspaces_set.insert(&resource); + } + } + } + tempspaces_.assign(tempspaces_set.begin(), tempspaces_set.end()); + } + } + + // Return the addresses of the gpu TempSpace areas + std::vector GetGPUTempspacePtrs(mshadow::Stream *s) { + std::vector ret; + for (const auto& resource : tempspaces_) { + // Ask for minimal allocation to get base pointer without increasing the size + auto *base_ptr = resource->get_space_typed(mshadow::Shape1(1), s).dptr_; + ret.push_back(static_cast(base_ptr)); + } + return ret; + } + + CudaGraphCache cache_; + std::vector tempspaces_; + std::string opr_names_; + bool verbose_; + bool is_enabled_; +}; + +} // namespace cuda_graphs +} // namespace mxnet + +#endif // CUDA_GRAPHS_AVAILABLE + +#endif // MXNET_EXECUTOR_CUDA_GRAPHS_H_ diff --git a/src/imperative/exec_pass.h b/src/imperative/exec_pass.h index 7667d97632fc..6da6c8d95c69 100644 --- a/src/imperative/exec_pass.h +++ b/src/imperative/exec_pass.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,13 @@ class OpExecutor { std::vector req; /*! \brief runtime op context, contains allocated resources */ OpContext op_ctx; + /*! \brief attributes of the node */ + NodeAttrs attrs; + /*! \brief dispatch mode of the executor */ + DispatchMode dispatch_mode; + + explicit OpExecutor(NodeAttrs attrs, DispatchMode dispatch_mode) : + attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {} /*! \brief virtual destructor */ virtual ~OpExecutor() {} /*! @@ -98,6 +106,16 @@ class OpExecutor { * \param rctx The runtime context passed in by environment. */ virtual void Run(RunContext rctx, bool is_gpu) = 0; + /*! + * \brief run the operators of a vector of execs, given runtime context on device. + * This function call does not synchronize the stream. 
+ * \param rctx The runtime context passed in by environment. + */ + static void RunAll(const std::vector > &execs, + RunContext rctx, bool is_gpu) { + for (auto &exec : execs) + exec->Run(rctx, is_gpu); + } /*! \return the execution type */ virtual ExecType exec_type() const = 0; /*! \return return engine variable for operator states */ diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index ce1a60fb2b20..bd9fe783ab28 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -27,6 +27,8 @@ #include #include +#include "./exec_pass.h" +#include "./cuda_graphs.h" #include "../c_api/c_api_common.h" #include "../common/exec_utils.h" #include "../common/utils.h" @@ -1248,6 +1250,21 @@ inline Engine::OprHandle CreateEngineOp( bool is_gpu = default_ctx.dev_mask() == gpu::kDevMask; bool is_async = execs.size() > 1 ? false : execs[0]->exec_type() == ExecType::kAsync; +#if CUDA_GRAPHS_AVAILABLE + // Provide initialized `cuda_graphs_exec`, which when captured + // by exec_fun, acts like a static variable inside the mutable closure. 
+ cuda_graphs::CudaGraphsExec cuda_graphs_exec(execs, is_gpu, opr_names); + auto exec_fun = [cuda_graphs_exec, execs, is_async, is_gpu] ( + RunContext ctx, + Engine::CallbackOnStart on_start, + Engine::CallbackOnComplete on_complete) mutable { + on_start(); + if (is_async) { + execs[0]->op_ctx.async_on_complete = on_complete; + } + // Run all opr in the sub-graph with CUDA graphs executor if possible + cuda_graphs_exec.RunAll(execs, ctx, is_gpu); +#else auto exec_fun = [execs, is_async, is_gpu](RunContext ctx, Engine::CallbackOnStart on_start, Engine::CallbackOnComplete on_complete) { @@ -1255,8 +1272,8 @@ inline Engine::OprHandle CreateEngineOp( if (is_async) { execs[0]->op_ctx.async_on_complete = on_complete; } - for (const auto& exec : execs) - exec->Run(ctx, is_gpu); + exec::OpExecutor::RunAll(execs, ctx, is_gpu); +#endif // call on complete only if it is async op if (!is_async) { if (is_gpu) { diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu index 802378839bc2..33fca6ccea67 100644 --- a/src/operator/contrib/adamw.cu +++ b/src/operator/contrib/adamw.cu @@ -45,17 +45,34 @@ void GetScaleFloat(mshadow::Stream* s, const TBlob& scale_blob, float* })} NNVM_REGISTER_OP(_adamw_update) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", adamw::MPUpdate>); NNVM_REGISTER_OP(_mp_adamw_update) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", adamw::MPUpdate>); NNVM_REGISTER_OP(_multi_adamw_update) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", adamw::multiMPUpdate); NNVM_REGISTER_OP(_multi_mp_adamw_update) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", adamw::multiMPUpdate); + } // namespace adamw } // namespace op } // namespace mxnet diff --git 
a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index d6c97f5f09fc..bff9b020126c 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -28,7 +28,13 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(Dropout).set_attr("FStatefulCompute", DropoutCompute); +NNVM_REGISTER_OP(Dropout) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool is_train) { + // Dropout is just passthrough during inference + return !is_train; + }) + .set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) .set_attr("FStatefulCompute", DropoutGradCompute); diff --git a/src/operator/numpy/linalg/np_eig.cu b/src/operator/numpy/linalg/np_eig.cu index 1f89106bab47..ab5c6644501d 100644 --- a/src/operator/numpy/linalg/np_eig.cu +++ b/src/operator/numpy/linalg/np_eig.cu @@ -28,11 +28,21 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_eig).set_attr("FCompute", EigOpForward); +NNVM_REGISTER_OP(_npi_eig) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", EigOpForward); #if MXNET_USE_CUSOLVER == 1 -NNVM_REGISTER_OP(_npi_eigh).set_attr("FCompute", EighOpForward); +NNVM_REGISTER_OP(_npi_eigh) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", EighOpForward); #endif diff --git a/src/operator/numpy/linalg/np_eigvals.cu b/src/operator/numpy/linalg/np_eigvals.cu index dc03805c54d0..94a007d7a245 100644 --- a/src/operator/numpy/linalg/np_eigvals.cu +++ b/src/operator/numpy/linalg/np_eigvals.cu @@ -28,11 +28,21 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_eigvals).set_attr("FCompute", EigvalsOpForward); +NNVM_REGISTER_OP(_npi_eigvals) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", EigvalsOpForward); #if MXNET_USE_CUSOLVER == 1 -NNVM_REGISTER_OP(_npi_eigvalsh).set_attr("FCompute", EigvalshOpForward); 
+NNVM_REGISTER_OP(_npi_eigvalsh) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", EigvalshOpForward); #endif diff --git a/src/operator/numpy/linalg/np_norm_backward.cu b/src/operator/numpy/linalg/np_norm_backward.cu index 24d8783dba33..e88d717bd580 100644 --- a/src/operator/numpy/linalg/np_norm_backward.cu +++ b/src/operator/numpy/linalg/np_norm_backward.cu @@ -26,6 +26,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_backward_npi_norm) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeBackward); } // namespace op diff --git a/src/operator/numpy/linalg/np_norm_forward.cu b/src/operator/numpy/linalg/np_norm_forward.cu index 89267632d898..26a87c580010 100644 --- a/src/operator/numpy/linalg/np_norm_forward.cu +++ b/src/operator/numpy/linalg/np_norm_forward.cu @@ -25,7 +25,13 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_norm).set_attr("FCompute", NumpyNormComputeForward); +NNVM_REGISTER_OP(_npi_norm) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) + .set_attr("FCompute", NumpyNormComputeForward); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_boolean_mask_assign.cu b/src/operator/numpy/np_boolean_mask_assign.cu index 10f8612a3ffb..8e1bd57386cd 100644 --- a/src/operator/numpy/np_boolean_mask_assign.cu +++ b/src/operator/numpy/np_boolean_mask_assign.cu @@ -273,9 +273,17 @@ void NumpyBooleanAssignForwardGPU(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_npi_boolean_mask_assign_scalar) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); 
NNVM_REGISTER_OP(_npi_boolean_mask_assign_tensor) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); } // namespace op diff --git a/src/operator/numpy/np_constraint_check.cu b/src/operator/numpy/np_constraint_check.cu index 04a0a36f4043..8d622d152bac 100644 --- a/src/operator/numpy/np_constraint_check.cu +++ b/src/operator/numpy/np_constraint_check.cu @@ -38,6 +38,10 @@ void GetReduceOutput(mshadow::Stream* s, const TBlob& output_blob, boo } NNVM_REGISTER_OP(_npx_constraint_check) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", ConstraintCheckForward); } // namespace op diff --git a/src/operator/numpy/np_percentile_op.cu b/src/operator/numpy/np_percentile_op.cu index 13d076dd9b53..7ff67ad3c9c5 100644 --- a/src/operator/numpy/np_percentile_op.cu +++ b/src/operator/numpy/np_percentile_op.cu @@ -52,7 +52,12 @@ bool CheckInvalidInput(mshadow::Stream* s, return is_valid == 0; } -NNVM_REGISTER_OP(_npi_percentile).set_attr("FCompute", NumpyPercentileForward); +NNVM_REGISTER_OP(_npi_percentile) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyPercentileForward); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/random/np_bernoulli_op.cu b/src/operator/numpy/random/np_bernoulli_op.cu index 8cdceb5bb4c8..0cf9bd95ab7c 100644 --- a/src/operator/numpy/random/np_bernoulli_op.cu +++ b/src/operator/numpy/random/np_bernoulli_op.cu @@ -27,7 +27,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_bernoulli).set_attr("FCompute", NumpyBernoulliForward); +NNVM_REGISTER_OP(_npi_bernoulli) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyBernoulliForward); } // namespace op } // namespace mxnet diff --git 
a/src/operator/numpy/random/np_exponential_op.cu b/src/operator/numpy/random/np_exponential_op.cu index 60809fbb91c5..7a2068dad8a1 100644 --- a/src/operator/numpy/random/np_exponential_op.cu +++ b/src/operator/numpy/random/np_exponential_op.cu @@ -28,7 +28,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_exponential) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyExponentialForward); NNVM_REGISTER_OP(_backward_broadcast_exponential) .set_attr("FCompute", ExponentialReparamBackward); diff --git a/src/operator/numpy/random/np_gamma_op.cu b/src/operator/numpy/random/np_gamma_op.cu index 7e3cabc3a83f..8bfc61aad7ab 100644 --- a/src/operator/numpy/random/np_gamma_op.cu +++ b/src/operator/numpy/random/np_gamma_op.cu @@ -28,7 +28,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_gamma).set_attr("FCompute", NumpyGammaForward); +NNVM_REGISTER_OP(_npi_gamma) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyGammaForward); NNVM_REGISTER_OP(_backward_gamma_sample).set_attr("FCompute", NumpyGammaGrad); diff --git a/src/operator/numpy/random/np_multinomial_op.cu b/src/operator/numpy/random/np_multinomial_op.cu index 083b410a2d8a..ee77b79c6d91 100644 --- a/src/operator/numpy/random/np_multinomial_op.cu +++ b/src/operator/numpy/random/np_multinomial_op.cu @@ -41,6 +41,10 @@ void CheckPvalGPU(const OpContext& ctx, DType* input, int prob_length) { } NNVM_REGISTER_OP(_npi_multinomial) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyMultinomialForward); } // namespace op diff --git a/src/operator/numpy/random/np_normal_op.cu b/src/operator/numpy/random/np_normal_op.cu index db8746165c6e..3d310f82b20d
100644 --- a/src/operator/numpy/random/np_normal_op.cu +++ b/src/operator/numpy/random/np_normal_op.cu @@ -27,12 +27,22 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_normal).set_attr("FCompute", NumpyNormalForward); +NNVM_REGISTER_OP(_npi_normal) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyNormalForward); NNVM_REGISTER_OP(_backward_broadcast_normal) .set_attr("FCompute", NormalReparamBackward); -NNVM_REGISTER_OP(_npi_normal_n).set_attr("FCompute", NumpyNormalForward); +NNVM_REGISTER_OP(_npi_normal_n) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyNormalForward); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/random/np_pareto_op.cu b/src/operator/numpy/random/np_pareto_op.cu index 7618d2871099..59900a0090e8 100644 --- a/src/operator/numpy/random/np_pareto_op.cu +++ b/src/operator/numpy/random/np_pareto_op.cu @@ -27,7 +27,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_pareto).set_attr("FCompute", NumpyParetoForward); +NNVM_REGISTER_OP(_npi_pareto) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyParetoForward); NNVM_REGISTER_OP(_backward_broadcast_pareto) .set_attr("FCompute", ParetoReparamBackward); diff --git a/src/operator/numpy/random/np_power_op.cu b/src/operator/numpy/random/np_power_op.cu index 290442037eee..f5bcb1000771 100644 --- a/src/operator/numpy/random/np_power_op.cu +++ b/src/operator/numpy/random/np_power_op.cu @@ -27,7 +27,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_powerd).set_attr("FCompute", NumpyPowerForward); +NNVM_REGISTER_OP(_npi_powerd) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyPowerForward); } // namespace op } // namespace mxnet diff --git 
a/src/operator/numpy/random/np_rayleigh_op.cu b/src/operator/numpy/random/np_rayleigh_op.cu index 586f17481e30..c17d6c2bdd79 100644 --- a/src/operator/numpy/random/np_rayleigh_op.cu +++ b/src/operator/numpy/random/np_rayleigh_op.cu @@ -27,7 +27,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_rayleigh).set_attr("FCompute", NumpyRayleighForward); +NNVM_REGISTER_OP(_npi_rayleigh) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyRayleighForward); NNVM_REGISTER_OP(_backward_broadcast_rayleigh) .set_attr("FCompute", RayleighReparamBackward); diff --git a/src/operator/numpy/random/np_weibull_op.cu b/src/operator/numpy/random/np_weibull_op.cu index 658be16e6333..62c0a564c060 100644 --- a/src/operator/numpy/random/np_weibull_op.cu +++ b/src/operator/numpy/random/np_weibull_op.cu @@ -27,7 +27,12 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_weibull).set_attr("FCompute", NumpyWeibullForward); +NNVM_REGISTER_OP(_npi_weibull) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyWeibullForward); NNVM_REGISTER_OP(_backward_broadcast_weibull) .set_attr("FCompute", WeibullReparamBackward); diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 90504301cc22..f4ce8ebcfd8a 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -957,7 +957,12 @@ NNVM_REGISTER_OP(batch_take).set_attr("FCompute", BatchTakeOpForw NNVM_REGISTER_OP(one_hot).set_attr("FCompute", OneHotOpForward); -NNVM_REGISTER_OP(gather_nd).set_attr("FCompute", GatherNDForwardGPU); +NNVM_REGISTER_OP(gather_nd) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", GatherNDForwardGPU); NNVM_REGISTER_OP(scatter_nd).set_attr("FCompute", ScatterNDForward); diff --git a/tests/python/gpu/test_gluon_gpu.py 
b/tests/python/gpu/test_gluon_gpu.py index 492055f4aef2..3e54c5ed647a 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -18,6 +18,7 @@ import sys import os import time +import random import mxnet as mx import multiprocessing as mp from mxnet.test_utils import check_consistency, set_default_device, assert_almost_equal, rand_ndarray, environment @@ -28,7 +29,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) -from common import assert_raises_cudnn_not_satisfied, run_in_spawned_process +from common import assert_raises_cudnn_not_satisfied, run_in_spawned_process, random_seed from test_gluon import * from test_loss import * from test_numpy_loss import * @@ -595,3 +596,110 @@ def test_cudnn_dropout_reproducibility(): assert_almost_equal(a.grad, b.grad) +def test_cuda_graphs(): + class GraphTester(gluon.HybridBlock): + def __init__(self, function_to_test, **kwargs): + super(GraphTester, self).__init__(**kwargs) + with self.name_scope(): + self.f = function_to_test() + + def hybrid_forward(self, F, *args): + # We need to isolate the operation to be fully inside the graph + # in order for graphs usage to be possible + copied_args = [F.identity(a) for a in args] + outputs = self.f(*copied_args) + if isinstance(outputs, (list, tuple)): + return [F.identity(o) for o in outputs] + else: + return F.identity(outputs) + + class TestDesc: + def __init__(self, name, f, num_inputs=1, input_dim=4): + self.name = name + self.f = f + self.num_inputs = num_inputs + self.input_dim = input_dim + + def generate_inputs(self): + shape = tuple(np.random.randint(4, 11, size=self.input_dim)) + ret = [mx.random.uniform(shape=shape) for _ in range(self.num_inputs)] + for r in ret: + r.attach_grad() + return ret + + tested_ops = [ + TestDesc('add', lambda: (lambda x, y: x + y), num_inputs = 2), + TestDesc('add_scalar', lambda: (lambda x: x + 0.5)), + TestDesc('Conv', 
lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(1,1))), + TestDesc('ConvTranspose', lambda: mx.gluon.nn.Conv2DTranspose(channels=32, kernel_size=(1,1))), + TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), + TestDesc('Activation', lambda: mx.gluon.nn.Activation('tanh')), + #TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), + TestDesc('Flatten', lambda: mx.gluon.nn.Flatten()), + TestDesc('MaxPool', lambda: mx.gluon.nn.MaxPool2D()), + TestDesc('AvgPool', lambda: mx.gluon.nn.AvgPool2D()), + TestDesc('GlobalMaxPool', lambda: mx.gluon.nn.GlobalMaxPool2D()), + TestDesc('GlobalAvgPool', lambda: mx.gluon.nn.GlobalAvgPool2D()), + TestDesc('ReflectionPad2D', lambda: mx.gluon.nn.ReflectionPad2D()), + TestDesc('BatchNorm', lambda: mx.gluon.nn.BatchNorm()), + TestDesc('InstanceNorm', lambda: mx.gluon.nn.InstanceNorm()), + TestDesc('LayerNorm', lambda: mx.gluon.nn.LayerNorm()), + TestDesc('LeakyReLU', lambda: mx.gluon.nn.LeakyReLU(0.1)), + TestDesc('PReLU', lambda: mx.gluon.nn.PReLU()), + TestDesc('ELU', lambda: mx.gluon.nn.ELU()), + TestDesc('SELU', lambda: mx.gluon.nn.SELU()), + TestDesc('Swish', lambda: mx.gluon.nn.Swish()), + ] + + N = 10 + + with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', + 'MXNET_USE_FUSION': '0'}): + for test_desc in tested_ops: + print("Testing ", test_desc.name) + inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) + + # initialize parameters + net.initialize() + netg.initialize() + + net(*inputs) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + p2.set_data(p1.data()) + + netg.hybridize(static_alloc=True, static_shape=True) + + print("Testing inference mode") + with random_seed(seed): + for _ in range(N): + assert_almost_equal(net(*inputs), netg(*inputsg)) + + mx.nd.waitall() + print("Testing training mode") + for _ in range(N): + with 
random_seed(seed): + with mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) + mx.nd.waitall() From 9ca54abd1eb874ee8ba5e0362269daadeed1d149 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 9 Jun 2021 18:31:50 -0700 Subject: [PATCH 02/30] Fix compile and test_cuda_graphs --- src/imperative/cuda_graphs.h | 2 +- tests/python/gpu/test_gluon_gpu.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index e669d7d1d2e3..2dbebdffab54 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -32,7 +32,7 @@ #include #include "./exec_pass.h" -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #if MXNET_USE_CUDA #define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10020) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 3e54c5ed647a..f58322d101bb 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -600,8 +600,7 @@ def test_cuda_graphs(): class GraphTester(gluon.HybridBlock): def __init__(self, function_to_test, **kwargs): super(GraphTester, self).__init__(**kwargs) - with self.name_scope(): - self.f = function_to_test() + self.f = function_to_test() def hybrid_forward(self, F, *args): # We need to isolate the operation to be fully inside the graph From 0b4ed478d159385f85d9a3a3fffe56015cf203fd Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 9 Jun 2021 20:19:26 -0700 Subject: [PATCH 03/30] Fix lint --- src/imperative/cuda_graphs.h | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 2dbebdffab54..e68d6a6a41d9 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -22,8 +22,8 @@ * \file cuda_graphs.h * \brief Wrappers for use of CUDA Graphs API */ -#ifndef MXNET_EXECUTOR_CUDA_GRAPHS_H_ -#define MXNET_EXECUTOR_CUDA_GRAPHS_H_ +#ifndef MXNET_IMPERATIVE_CUDA_GRAPHS_H_ +#define MXNET_IMPERATIVE_CUDA_GRAPHS_H_ #include #include @@ -544,4 +544,4 @@ class CudaGraphsExec { #endif // CUDA_GRAPHS_AVAILABLE -#endif // MXNET_EXECUTOR_CUDA_GRAPHS_H_ +#endif // MXNET_IMPERATIVE_CUDA_GRAPHS_H_ From cc694868a6678ed6157b80555e4e00ca3e187cf3 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 24 Jun 2021 18:56:53 -0700 Subject: [PATCH 04/30] Mark more ops as not CUDA Graphs compatible --- src/imperative/cuda_graphs.h | 9 +-------- src/operator/contrib/index_array.cu | 7 ++++++- src/operator/contrib/multi_lamb.cu | 8 ++++++++ src/operator/numpy/np_matrix_op.cu | 16 +++++++++++++--- src/operator/numpy/np_nonzero_op.cu | 4 ++++ src/operator/tensor/elemwise_unary_op_basic.cu | 7 ++++++- src/operator/tensor/matrix_op.cu | 16 +++++++++++++--- 7 files changed, 51 insertions(+), 16 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index e68d6a6a41d9..bab3a062d314 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -67,10 +67,7 @@ inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { return graphNodes; } -// It does not really involve RTC, but requires libcuda.so, -// which is linked only when RTC is enabled. 
-#if MXNET_ENABLE_CUDA_RTC - +// Create a description of a CUDA Graph node inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { std::stringstream ss; @@ -141,8 +138,6 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { return ss.str(); } -#endif // MXNET_ENABLE_CUDA_RTC - // CUDA Graphs are managed in RAII fashion by smart pointers below. // Function objects (preferred for readability) provide the deleter function. class CudaGraphDeleter { @@ -241,12 +236,10 @@ class CudaGraphsSubSegExec { std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); size_t num_nodes = graph_nodes.size(); LOG(INFO) << " Graph has " << num_nodes << " nodes:"; -#if MXNET_ENABLE_CUDA_RTC for (size_t i = 0; i != num_nodes; ++i) { LOG(INFO) << " node " << i << " = " << CudaGraphNodeToString(graph_nodes[i]); } -#endif // MXNET_ENABLE_CUDA_RTC } } diff --git a/src/operator/contrib/index_array.cu b/src/operator/contrib/index_array.cu index 482cbf6b8150..d317ccdf7059 100644 --- a/src/operator/contrib/index_array.cu +++ b/src/operator/contrib/index_array.cu @@ -82,7 +82,12 @@ void IndexArrayForwardGPU(const nnvm::NodeAttrs& attrs, } } -NNVM_REGISTER_OP(_contrib_index_array).set_attr("FCompute", IndexArrayForwardGPU); +NNVM_REGISTER_OP(_contrib_index_array) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) + .set_attr("FCompute", IndexArrayForwardGPU); } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/multi_lamb.cu b/src/operator/contrib/multi_lamb.cu index 118ec6348ed7..8361e04cabb3 100644 --- a/src/operator/contrib/multi_lamb.cu +++ b/src/operator/contrib/multi_lamb.cu @@ -268,9 +268,17 @@ void CallKernel2(Stream* s, } NNVM_REGISTER_OP(_multi_lamb_update) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) .set_attr("FCompute", MultiLAMBUpdate); NNVM_REGISTER_OP(_multi_mp_lamb_update) + .set_attr("FIsCUDAGraphsCompatible", + 
[](const NodeAttrs& attrs, const bool) { + return false; + }) .set_attr("FCompute", MultiLAMBUpdate); } // namespace op diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index f2078146c78e..23b149973f6c 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -92,9 +92,19 @@ void NumpyFlipForwardImpl(const OpContext& ctx, }); } -NNVM_REGISTER_OP(_npi_flip).set_attr("FCompute", NumpyFlipForward); - -NNVM_REGISTER_OP(_backward_npi_flip).set_attr("FCompute", NumpyFlipForward); +NNVM_REGISTER_OP(_npi_flip) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) + .set_attr("FCompute", NumpyFlipForward); + +NNVM_REGISTER_OP(_backward_npi_flip) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) + .set_attr("FCompute", NumpyFlipForward); NNVM_REGISTER_OP(_npi_moveaxis).set_attr("FCompute", NumpyMoveaxisCompute); diff --git a/src/operator/numpy/np_nonzero_op.cu b/src/operator/numpy/np_nonzero_op.cu index 1499030dbe9b..8ae39bdf2c4c 100644 --- a/src/operator/numpy/np_nonzero_op.cu +++ b/src/operator/numpy/np_nonzero_op.cu @@ -115,6 +115,10 @@ NNVM_REGISTER_OP(_npx_nonzero) [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; }) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) .set_attr("FComputeEx", NonzeroForwardGPU); } // namespace op diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index 7fdc047630cb..baa36c6be587 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -115,7 +115,12 @@ void ShapeComputeGPU(const nnvm::NodeAttrs& attrs, mshadow::Stream::GetStream(s)); } -NNVM_REGISTER_OP(shape_array).set_attr("FCompute", ShapeComputeGPU); +NNVM_REGISTER_OP(shape_array) + .set_attr("FIsCUDAGraphsCompatible", + 
[](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", ShapeComputeGPU); void SizeComputeGPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index b5bd1c96d25b..dc7d608dacb9 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -412,9 +412,19 @@ NNVM_REGISTER_OP(tile).set_attr("FCompute", TileOpForward); NNVM_REGISTER_OP(_backward_tile).set_attr("FCompute", TileOpBackward); -NNVM_REGISTER_OP(reverse).set_attr("FCompute", ReverseOpForward); - -NNVM_REGISTER_OP(_backward_reverse).set_attr("FCompute", ReverseOpForward); +NNVM_REGISTER_OP(reverse) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", ReverseOpForward); + +NNVM_REGISTER_OP(_backward_reverse) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", ReverseOpForward); NNVM_REGISTER_OP(stack).set_attr("FCompute", StackOpForward); From e79c111a1776635c92b6e900c37674bed531907a Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 27 May 2021 20:59:00 -0700 Subject: [PATCH 05/30] Mark some linalg ops as not CUDA Graphs compatible --- src/imperative/cuda_graphs.h | 18 ++++++++---- src/operator/numpy/np_matrix_op.cu | 29 +++++++++++++++++-- src/operator/tensor/la_op.cu | 46 ++++++++++++++++++++++++++++-- 3 files changed, 83 insertions(+), 10 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index bab3a062d314..97424cfac074 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -189,11 +189,19 @@ class CudaGraphsSubSegExec { cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; cudaGraphNode_t error_node; - CUDA_CALL(cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), - &error_node, &update_result)); - // If update fails make a new executor, discarding old one. 
- if (update_result != cudaGraphExecUpdateSuccess) - MakeGraphExec(); + cudaError_t e = cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), + &error_node, &update_result); + switch (e) { + case cudaErrorGraphExecUpdateFailure: + MakeGraphExec(); + break; + case cudaSuccess: + CHECK_EQ(update_result, cudaGraphExecUpdateSuccess); + break; + default: + // Respond normally to unusual cudaGraphExecUpdate() ret vals + CUDA_CALL(e); + } } void RunSubSeg(const std::vector > &exec_list, diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index 23b149973f6c..ed9ab81f2066 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -52,9 +52,19 @@ NNVM_REGISTER_OP(_npi_column_stack) NNVM_REGISTER_OP(_backward_np_column_stack) .set_attr("FCompute", NumpyColumnStackBackward); -NNVM_REGISTER_OP(_npi_tril_indices).set_attr("FCompute", TrilindicesOpForward); +NNVM_REGISTER_OP(_npi_tril_indices) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) + .set_attr("FCompute", TrilindicesOpForward); -NNVM_REGISTER_OP(_npi_roll).set_attr("FCompute", NumpyRollCompute); +NNVM_REGISTER_OP(_npi_roll) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + return false; + }) + .set_attr("FCompute", NumpyRollCompute); template <> void NumpyFlipForwardImpl(const OpContext& ctx, @@ -113,7 +123,20 @@ NNVM_REGISTER_OP(_npi_rollaxis).set_attr("FCompute", NumpyRollaxi NNVM_REGISTER_OP(_npi_rollaxis_backward) .set_attr("FCompute", NumpyRollaxisBackward); -NNVM_REGISTER_OP(_npi_rot90).set_attr("FCompute", NumpyRot90Compute); +NNVM_REGISTER_OP(_npi_rot90) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const auto& param = nnvm::get(attrs.parsed); + // Should track code in NumpyRot90Compute() + int real_k(param.k); + real_k = real_k % 4; + if (real_k < 0) { + real_k += 4; + } + // Avoid NumpyRot90ComputeFlipIml(), which uses 
mshadow::Copy() + return real_k != 2; + }) + .set_attr("FCompute", NumpyRot90Compute); NNVM_REGISTER_OP(_npi_hsplit).set_attr("FCompute", HSplitOpForward); diff --git a/src/operator/tensor/la_op.cu b/src/operator/tensor/la_op.cu index 1f16e2d58251..8cae4a4d5be8 100644 --- a/src/operator/tensor/la_op.cu +++ b/src/operator/tensor/la_op.cu @@ -88,6 +88,10 @@ NNVM_REGISTER_OP(_backward_linalg_maketrian) .set_attr("FCompute", LaOpBackward); NNVM_REGISTER_OP(_linalg_potri) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_potri) @@ -99,32 +103,70 @@ NNVM_REGISTER_OP(_linalg_inverse) NNVM_REGISTER_OP(_backward_linalg_inverse) .set_attr("FCompute", LaOpBackward); -NNVM_REGISTER_OP(_linalg_det).set_attr("FCompute", LaOpDetForward); +NNVM_REGISTER_OP(_linalg_det) + // Incompatibility comes from allocs made in linalg_batch_getrf(), called by det::op() + // see https://github.com/apache/incubator-mxnet/issues/19353 + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", LaOpDetForward); NNVM_REGISTER_OP(_backward_linalg_det) + // Incompatibility comes from allocs made in linalg_batch_getri(), + // called by linalg_batch_det_backward_helper, called by det_backward::op() + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpDetBackward); NNVM_REGISTER_OP(_linalg_slogdet) + // Incompatibility comes from allocs made in linalg_batch_getrf(), + // called by slogdet::op(). 
+ .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpDetForward); NNVM_REGISTER_OP(_backward_linalg_slogdet) + // Incompatibility comes from allocs made in linalg_batch_getri(), + // called by linalg_batch_det_backward_helper, called by slogdet_backward::op() + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpDetBackward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_linalg_potrf) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_potrf) .set_attr("FCompute", LaOpBackward); NNVM_REGISTER_OP(_linalg_gelqf) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_gelqf) .set_attr("FCompute", LaOpBackward); -NNVM_REGISTER_OP(_linalg_syevd).set_attr("FCompute", LaOpForwSyevd); +NNVM_REGISTER_OP(_linalg_syevd) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", LaOpForwSyevd); NNVM_REGISTER_OP(_backward_linalg_syevd) .set_attr("FCompute", LaOpBackwSyevd); From 4a2dae456758afa2fda0c1c90b7562ae44108a8f Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 28 May 2021 16:45:35 -0700 Subject: [PATCH 06/30] Marked 2 ops CUDA Graphs incompatible due to cpu->gpu copy --- src/operator/numpy/np_pad_op.cu | 16 ++++++++++++++-- src/operator/tensor/matrix_op.cu | 16 ++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/operator/numpy/np_pad_op.cu b/src/operator/numpy/np_pad_op.cu index 01a7035ab42d..97cb423b7bab 100644 --- a/src/operator/numpy/np_pad_op.cu +++ b/src/operator/numpy/np_pad_op.cu @@ -28,9 +28,21 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npi_pad).set_attr("FCompute", NumpyPadOpForward); 
+NNVM_REGISTER_OP(_npi_pad) + // Incompatible due to Copy(xpu_tensor, cpu_tensor) in NumpyPadOpForward + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyPadOpForward); -NNVM_REGISTER_OP(_backward_npi_pad).set_attr("FCompute", NumpyPadOpBackward); +NNVM_REGISTER_OP(_backward_npi_pad) + // Incompatible due to Copy(xpu_tensor, cpu_tensor) in NumpyPadOpBackward + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", NumpyPadOpBackward); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index dc7d608dacb9..e26b767802a3 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -439,9 +439,21 @@ NNVM_REGISTER_OP(depth_to_space).set_attr("FCompute", DepthToSpac NNVM_REGISTER_OP(space_to_depth).set_attr("FCompute", SpaceToDepthOpForward); -NNVM_REGISTER_OP(_split_v2).set_attr("FCompute", SplitOpForwardGPU); +NNVM_REGISTER_OP(_split_v2) + // Incompatible due to Copy(xpu_tensor, cpu_tensor) in SplitOpForwardImpl + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", SplitOpForwardGPU); -NNVM_REGISTER_OP(_split_v2_backward).set_attr("FCompute", SplitOpBackward); +NNVM_REGISTER_OP(_split_v2_backward) + // Incompatible due to Copy(xpu_tensor, cpu_tensor) in SplitOpBackwardImpl + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) + .set_attr("FCompute", SplitOpBackward); } // namespace op } // namespace mxnet From 64e8555c454a60e79904be917ef9cf0793f1acd3 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 28 May 2021 21:56:14 -0700 Subject: [PATCH 07/30] Mark cuDNN Dropout as fully CUDA Graphs compatible. Reenable tests. 
--- src/operator/nn/dropout.cu | 16 ++++++++++++---- tests/python/gpu/test_gluon_gpu.py | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index bff9b020126c..5e8f55881af6 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -30,10 +30,18 @@ namespace op { NNVM_REGISTER_OP(Dropout) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool is_train) { - // Dropout is just passthrough during inference - return !is_train; - }) + [](const NodeAttrs& attrs, const bool is_train) { + // Dropout is just passthrough during inference for all impls + if (!is_train) + return true; + + // cuDNN impl is compatible during training as well + const DropoutParam& param = nnvm::get(attrs.parsed); + real_t pkeep = 1.0f - param.p; + bool cudnn_off = param.cudnn_off && param.cudnn_off.value(); + bool cudnn_available = pkeep > 0 && !cudnn_off; + return MXNET_USE_CUDNN_DROPOUT && cudnn_available; + }) .set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index f58322d101bb..af6805c9cd5c 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -633,7 +633,7 @@ def generate_inputs(self): TestDesc('ConvTranspose', lambda: mx.gluon.nn.Conv2DTranspose(channels=32, kernel_size=(1,1))), TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), TestDesc('Activation', lambda: mx.gluon.nn.Activation('tanh')), - #TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), + TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), TestDesc('Flatten', lambda: mx.gluon.nn.Flatten()), TestDesc('MaxPool', lambda: mx.gluon.nn.MaxPool2D()), TestDesc('AvgPool', lambda: mx.gluon.nn.AvgPool2D()), From 78215fa94425c56b26d43c9a4cd7a49181cbfb81 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 17 Feb 2022 13:08:11 -0800 Subject: [PATCH 08/30] clang-tidy fixes 
--- src/imperative/attach_op_execs_pass.cc | 34 ++- src/imperative/cuda_graphs.h | 285 +++++++++--------- src/imperative/exec_pass.h | 11 +- src/imperative/imperative_utils.h | 8 +- src/operator/contrib/adamw.cu | 17 +- src/operator/contrib/index_array.cu | 4 +- src/operator/contrib/multi_lamb.cu | 8 +- src/operator/nn/dropout.cu | 24 +- src/operator/numpy/linalg/np_eig.cu | 8 +- src/operator/numpy/linalg/np_eigvals.cu | 8 +- src/operator/numpy/linalg/np_norm_backward.cu | 9 +- src/operator/numpy/linalg/np_norm_forward.cu | 9 +- src/operator/numpy/np_boolean_mask_assign.cu | 8 +- src/operator/numpy/np_constraint_check.cu | 4 +- src/operator/numpy/np_matrix_op.cu | 40 ++- src/operator/numpy/np_nonzero_op.cu | 4 +- src/operator/numpy/np_pad_op.cu | 8 +- src/operator/numpy/np_percentile_op.cu | 4 +- .../numpy/random/np_exponential_op.cu | 10 +- src/operator/numpy/random/np_gamma_op.cu | 4 +- .../numpy/random/np_multinomial_op.cu | 4 +- src/operator/numpy/random/np_normal_op.cu | 8 +- src/operator/numpy/random/np_pareto_op.cu | 4 +- src/operator/numpy/random/np_power_op.cu | 4 +- src/operator/numpy/random/np_rayleigh_op.cu | 4 +- .../tensor/elemwise_unary_op_basic.cu | 4 +- src/operator/tensor/indexing_op.cu | 4 +- src/operator/tensor/la_op.cu | 32 +- src/operator/tensor/matrix_op.cu | 16 +- 29 files changed, 260 insertions(+), 327 deletions(-) diff --git a/src/imperative/attach_op_execs_pass.cc b/src/imperative/attach_op_execs_pass.cc index 719c9b3165fe..51736c577e5b 100644 --- a/src/imperative/attach_op_execs_pass.cc +++ b/src/imperative/attach_op_execs_pass.cc @@ -48,8 +48,8 @@ namespace exec { class StorageFallbackOpExecutor : public OpExecutor { public: explicit StorageFallbackOpExecutor(const NodeAttrs& attrs, - DispatchMode dispatch_mode, - std::vector mutate_idx) + DispatchMode dispatch_mode, + std::vector mutate_idx) : OpExecutor(attrs, dispatch_mode), mutate_idx_(std::move(mutate_idx)) {} void Setup() override { @@ -253,10 +253,13 @@ class FComputeExExecutor 
: public OpExecutor { return exec_type_; } - explicit FComputeExExecutor(const NodeAttrs& attrs, DispatchMode dispatch_mode, - FComputeEx fcompute, ExecType exec_type) - : OpExecutor(attrs, dispatch_mode), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} - : attrs_(std::move(attrs)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} + explicit FComputeExExecutor(const NodeAttrs& attrs, + DispatchMode dispatch_mode, + FComputeEx fcompute, + ExecType exec_type) + : OpExecutor(attrs, dispatch_mode), + fcompute_(std::move(fcompute)), + exec_type_(exec_type) {} private: FComputeEx fcompute_; @@ -312,8 +315,11 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, common::GetFCompute(op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared( - inode.source->attrs, dispatch_modes[i], state, fcompute_ex, exec_type); + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], + state, + fcompute_ex, + exec_type); } else { FStatefulCompute fcompute = common::GetFCompute(op, "FStatefulCompute", vctx[i]); @@ -333,7 +339,10 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared(inode.source->attrs, - dispatch_modes[i], ret[fwd_id].get()->state(), fcompute_ex, exec_type); + dispatch_modes[i], + ret[fwd_id].get()->state(), + fcompute_ex, + exec_type); } else { FStatefulCompute fcompute = common::GetFCompute(op, "FStatefulCompute", vctx[i]); @@ -341,8 +350,11 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " 
<< op->name; ret[i] = std::make_shared(inode.source->attrs, - dispatch_modes[i], ret[fwd_id].get()->state(), fcompute, exec_type, - mutate_index); + dispatch_modes[i], + ret[fwd_id].get()->state(), + fcompute, + exec_type, + mutate_index); } } else { FCompute fcompute = common::GetFCompute(op, "FCompute", vctx[i]); diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 97424cfac074..a264a6edbfc4 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -78,62 +78,71 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { CUgraphNodeType t; CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); switch (t) { - case CU_GRAPH_NODE_TYPE_KERNEL: - { - CUDA_KERNEL_NODE_PARAMS kparams; - auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); - if (err == CUDA_SUCCESS) { - ss << "GPUKernel@" << kparams.func; - dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); - dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); - ss << "<<>>"; - ss << "(..."; - if (kparams.sharedMemBytes != 0) - ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; - ss << ")"; - } else { - ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; - } + case CU_GRAPH_NODE_TYPE_KERNEL: { + CUDA_KERNEL_NODE_PARAMS kparams; + auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); + if (err == CUDA_SUCCESS) { + ss << "GPUKernel@" << kparams.func; + dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); + dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); + ss << "<<>>"; + ss << "(..."; + if (kparams.sharedMemBytes != 0) + ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; + ss << ")"; + } else { + ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; } - break; - case CU_GRAPH_NODE_TYPE_MEMCPY: - { - cudaMemcpy3DParms mparams = {}; - CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); - // If memcpy is seen, return without setting up 
runnable executor - switch (mparams.kind) { - case cudaMemcpyHostToHost: ss << "Host->Host "; break; - case cudaMemcpyHostToDevice: ss << "Host->Device "; break; - case cudaMemcpyDeviceToHost: ss << "Device->Host "; break; - case cudaMemcpyDeviceToDevice: ss << "Device->Device "; break; - default: break; - } - ss << "Memcpy"; + } break; + case CU_GRAPH_NODE_TYPE_MEMCPY: { + cudaMemcpy3DParms mparams = {}; + CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); + // If memcpy is seen, return without setting up runnable executor + switch (mparams.kind) { + case cudaMemcpyHostToHost: + ss << "Host->Host "; + break; + case cudaMemcpyHostToDevice: + ss << "Host->Device "; + break; + case cudaMemcpyDeviceToHost: + ss << "Device->Host "; + break; + case cudaMemcpyDeviceToDevice: + ss << "Device->Device "; + break; + default: break; } - break; - case CU_GRAPH_NODE_TYPE_MEMSET: - { - cudaMemsetParams mparams = {}; - CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); - if (mparams.height == 1 && mparams.elementSize == 1) { - ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value - << ", count=" << mparams.width << ")"; - } else { - if (mparams.elementSize == 1) - ss << "cudaMemset2D"; - else - ss << "MemSet"; - ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch - << ", value=" << mparams.value << ", width=" << mparams.width - << ", height=" << mparams.height << ")"; - } + ss << "Memcpy"; + } break; + case CU_GRAPH_NODE_TYPE_MEMSET: { + cudaMemsetParams mparams = {}; + CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); + if (mparams.height == 1 && mparams.elementSize == 1) { + ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value + << ", count=" << mparams.width << ")"; + } else { + if (mparams.elementSize == 1) + ss << "cudaMemset2D"; + else + ss << "MemSet"; + ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch + << ", value=" << mparams.value << ", width=" << mparams.width + << ", height=" << 
mparams.height << ")"; } + } break; + case CU_GRAPH_NODE_TYPE_HOST: + ss << "Host (executable) node"; + break; + case CU_GRAPH_NODE_TYPE_GRAPH: + ss << "Node which executes an embedded graph"; + break; + case CU_GRAPH_NODE_TYPE_EMPTY: + ss << "Empty (no-op) node"; break; - case CU_GRAPH_NODE_TYPE_HOST: ss << "Host (executable) node"; break; - case CU_GRAPH_NODE_TYPE_GRAPH: ss << "Node which executes an embedded graph"; break; - case CU_GRAPH_NODE_TYPE_EMPTY: ss << "Empty (no-op) node"; break; - default: ss << "Unknown/Invalid node type " << t; + default: + ss << "Unknown/Invalid node type " << t; } return ss.str(); } @@ -142,7 +151,7 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { // Function objects (preferred for readability) provide the deleter function. class CudaGraphDeleter { public: - void operator() (cudaGraph_t graph) { + void operator()(cudaGraph_t graph) { if (graph != nullptr) CUDA_CALL(cudaGraphDestroy(graph)); } @@ -152,7 +161,7 @@ class CudaGraphDeleter { // Function objects (preferred for readability) provide the deleter function. class CudaGraphExecDeleter { public: - void operator() (cudaGraphExec_t graph_exec) { + void operator()(cudaGraphExec_t graph_exec) { if (graph_exec != nullptr) CUDA_CALL(cudaGraphExecDestroy(graph_exec)); } @@ -162,25 +171,22 @@ class CudaGraphExecDeleter { // characterized by a starting index in the OpExecutor list and a number of ops. 
class CudaGraphsSubSegExec { public: - CudaGraphsSubSegExec(const std::vector > &exec_list, - const RunContext &rctx, + CudaGraphsSubSegExec(const std::vector >& exec_list, + const RunContext& rctx, bool is_gpu, bool verbose, int from_op_idx, int num_ops, - bool ops_are_cuda_graph_compatible = true) : - from_op_idx_(from_op_idx), - num_ops_(num_ops), - graph_(nullptr), - graph_exec_(nullptr) { + bool ops_are_cuda_graph_compatible = true) + : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr) { if (ops_are_cuda_graph_compatible) { MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); MakeGraphExec(); } } - void Update(const std::vector > &exec_list, - const RunContext &rctx, + void Update(const std::vector >& exec_list, + const RunContext& rctx, bool is_gpu, bool verbose) { // Current executor should be Runnable with the same parameters @@ -189,8 +195,8 @@ class CudaGraphsSubSegExec { cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; cudaGraphNode_t error_node; - cudaError_t e = cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), - &error_node, &update_result); + cudaError_t e = + cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), &error_node, &update_result); switch (e) { case cudaErrorGraphExecUpdateFailure: MakeGraphExec(); @@ -204,11 +210,11 @@ class CudaGraphsSubSegExec { } } - void RunSubSeg(const std::vector > &exec_list, - const RunContext &rctx, + void RunSubSeg(const std::vector >& exec_list, + const RunContext& rctx, bool is_gpu) { if (IsRunnable()) { - auto s = rctx.get_stream(); + auto s = rctx.get_stream(); const cudaStream_t cu_s = mshadow::Stream::GetStream(s); CUDA_CALL(cudaGraphLaunch(graph_exec_.get(), cu_s)); } else { @@ -218,16 +224,18 @@ class CudaGraphsSubSegExec { } } - bool IsRunnable() { return graph_exec_ != nullptr; } + bool IsRunnable() { + return graph_exec_ != nullptr; + } private: - void MakeGraph(const std::vector > &exec_list, - const RunContext &rctx, - bool is_gpu, 
- bool verbose, - int from_op_idx, - int num_ops) { - auto s = rctx.get_stream(); + void MakeGraph(const std::vector >& exec_list, + const RunContext& rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops) { + auto s = rctx.get_stream(); const cudaStream_t cu_s = mshadow::Stream::GetStream(s); // Create CUDA Graph // Use of cudaStreamCaptureModeThreadLocal allows other threads like GPU Copy workers @@ -242,38 +250,36 @@ class CudaGraphsSubSegExec { if (verbose) { std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); - size_t num_nodes = graph_nodes.size(); + size_t num_nodes = graph_nodes.size(); LOG(INFO) << " Graph has " << num_nodes << " nodes:"; for (size_t i = 0; i != num_nodes; ++i) { - LOG(INFO) << " node " << i << " = " - << CudaGraphNodeToString(graph_nodes[i]); + LOG(INFO) << " node " << i << " = " << CudaGraphNodeToString(graph_nodes[i]); } } } void MakeGraphExec() { - cudaGraphExec_t cuda_graph_exec; - cudaGraphNode_t error_node; - char log_buffer[1000]; - - CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), - &error_node, log_buffer, 1000)); - graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); - - // At this point we have a CUDA Graph executor - static int num_graph_creations_logged = 0; - static int max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); - if (num_graph_creations_logged < max_log_entries) { - num_graph_creations_logged++; - LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; - if (num_graph_creations_logged == max_log_entries) - LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; - } + cudaGraphExec_t cuda_graph_exec; + cudaGraphNode_t error_node; + char log_buffer[1000]; + + CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), &error_node, log_buffer, 1000)); + graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); + + // At this point we have a CUDA Graph executor + static int num_graph_creations_logged = 0; + static int 
max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); + if (num_graph_creations_logged < max_log_entries) { + num_graph_creations_logged++; + LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; + if (num_graph_creations_logged == max_log_entries) + LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; + } } int from_op_idx_; int num_ops_; - using cudaGraphStruct_t = typename std::remove_pointer::type; + using cudaGraphStruct_t = typename std::remove_pointer::type; using cudaGraphExecStruct_t = typename std::remove_pointer::type; std::shared_ptr graph_; std::shared_ptr graph_exec_; @@ -283,7 +289,7 @@ class CudaGraphsSubSegExec { struct CudaGraphInfo { std::vector cuda_graph_subseg_execs; bool has_been_run_conventionally = false; - std::vector tempspace_dptrs; + std::vector tempspace_dptrs; }; // A CUDA graph is maintained for every combination of cudaStream_t (i.e. GPU Worker) and // the state of the is_train flag of the OpContext. If the tempspace_dptrs change, we @@ -292,7 +298,7 @@ struct CudaGraphCacheKey { cudaStream_t cu_s; bool is_train; // overload '<' so CudaGraphCacheKey can be used as a std::map key - bool operator<(const CudaGraphCacheKey &other) const { + bool operator<(const CudaGraphCacheKey& other) const { return cu_s < other.cu_s || (cu_s == other.cu_s && is_train < other.is_train); } }; @@ -300,20 +306,20 @@ using CudaGraphCache = std::map; class CudaGraphsExec { public: - CudaGraphsExec(const std::vector > &exec_list, + CudaGraphsExec(const std::vector >& exec_list, bool is_gpu, - const char *opr_names) : - verbose_(false), is_enabled_(false) { + const char *opr_names) + : verbose_(false), is_enabled_(false) { opr_names_ = opr_names ? 
std::string(opr_names) : std::string(); if (is_gpu) { is_enabled_ = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false); - verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); + verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); SetTempSpaces(exec_list); } } - void RunAll(const std::vector > &exec_list, - const RunContext &rctx, + void RunAll(const std::vector >& exec_list, + const RunContext& rctx, bool is_gpu) { // If this a CPU op or CUDA Graphs use isn't possible, run normally and return if (!is_gpu || !is_enabled_) { @@ -324,14 +330,14 @@ class CudaGraphsExec { // Also if we're in a warm-up period where tempspace pointers are likely // to change, run normally and return - auto s = rctx.get_stream(); + auto s = rctx.get_stream(); const cudaStream_t cu_s = mshadow::Stream::GetStream(s); // All the ops in the bulked segment will have the same setting of is_train as the first op - const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; + const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; const CudaGraphCacheKey key = {cu_s, is_train}; // Look-up the CUDA Graph info for this combo of stream and is_train setting // This may create a default-initialized new entry. - auto &cuda_graph_info = cache_[key]; + auto& cuda_graph_info = cache_[key]; if (!cuda_graph_info.has_been_run_conventionally) { // Run all opr in the sub-graph exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); @@ -347,7 +353,7 @@ class CudaGraphsExec { if (cuda_graph_info.cuda_graph_subseg_execs.size() > 0 && cuda_graph_info.tempspace_dptrs != before_exec_tempspace_ptrs) { // Update all runnable executors. Non-runnable executors launch their ops conventionally. 
- for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { if (subseg_exec.IsRunnable()) subseg_exec.Update(exec_list, rctx, is_gpu, verbose_); } @@ -365,7 +371,9 @@ class CudaGraphsExec { break; } if (num_good_ops > 0) { - CreateSubExecOverRegion(exec_list, rctx, is_gpu, + CreateSubExecOverRegion(exec_list, + rctx, + is_gpu, first_op_idx, first_op_idx + num_good_ops, &cuda_graph_info.cuda_graph_subseg_execs); @@ -375,7 +383,7 @@ class CudaGraphsExec { // We had to have hit an op that was not OK. if (verbose_) { LOG(INFO) << "Bypassing notOK op segment[" << first_op_idx << "," << first_op_idx << "]" - << " of op segment " << opr_names_; + << " of op segment " << opr_names_; } CudaGraphsSubSegExec notOK_opseg(exec_list, rctx, is_gpu, false, first_op_idx, 1, false); cuda_graph_info.cuda_graph_subseg_execs.push_back(notOK_opseg); @@ -393,19 +401,19 @@ class CudaGraphsExec { // Now execute the CUDA Graph that we either just created or looked-up in the cache. if (verbose_) { int runnable_execs = 0; - int bypassed_ops = 0; - for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + int bypassed_ops = 0; + for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { if (subseg_exec.IsRunnable()) runnable_execs++; else bypassed_ops++; } - LOG(INFO) << "Launching " << runnable_execs - << " captured CUDA graph(s) for op segment " << opr_names_; + LOG(INFO) << "Launching " << runnable_execs << " captured CUDA graph(s) for op segment " + << opr_names_; if (bypassed_ops > 0) LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; } - for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) + for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) subseg_exec.RunSubSeg(exec_list, rctx, is_gpu); } @@ -413,12 +421,12 @@ class CudaGraphsExec { // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). 
If such a graph // is not runnable, e.g. if it includes memcpys from unpinned cpu memory, then make a // number of smaller graphs that avoid those ops with the memcpys. - void CreateSubExecOverRegion(const std::vector > &exec_list, - const RunContext &rctx, + void CreateSubExecOverRegion(const std::vector >& exec_list, + const RunContext& rctx, bool is_gpu, size_t from_op_idx, size_t upto_op_idx, - std::vector *cuda_graph_subseg_execs) { + std::vector* cuda_graph_subseg_execs) { // Optimistically try to create a CUDA Graph of the entire op segment region int num_ops = upto_op_idx - from_op_idx; @@ -429,21 +437,20 @@ class CudaGraphsExec { if (verbose_) LOG(INFO) << " Graph was not runnable- creating op sub-segments..."; // Enter fall-back approach to making many sub-execs - for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx; ) { + for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx;) { int num_good_ops = 0; for (size_t last_op_idx = first_op_idx; last_op_idx != upto_op_idx; ++last_op_idx) { CudaGraphsSubSegExec single_opseg(exec_list, rctx, is_gpu, false, last_op_idx, 1); if (single_opseg.IsRunnable()) num_good_ops++; // Is it time to create a subseg exec from accumulated good ops? 
- if (num_good_ops > 0 && - (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { + if (num_good_ops > 0 && (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { if (verbose_) - LOG(INFO) << "Capturing CUDA graph of op sub segment[" - << first_op_idx << ":" << (first_op_idx + num_good_ops - 1) << "]" + LOG(INFO) << "Capturing CUDA graph of op sub segment[" << first_op_idx << ":" + << (first_op_idx + num_good_ops - 1) << "]" << " of op segment " << opr_names_; - CudaGraphsSubSegExec good_opseg(exec_list, rctx, is_gpu, verbose_, - first_op_idx, num_good_ops); + CudaGraphsSubSegExec good_opseg( + exec_list, rctx, is_gpu, verbose_, first_op_idx, num_good_ops); CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; cuda_graph_subseg_execs->push_back(good_opseg); first_op_idx += num_good_ops; @@ -452,7 +459,7 @@ class CudaGraphsExec { if (!single_opseg.IsRunnable()) { if (verbose_) { LOG(INFO) << "Bypassing op sub segment[" << last_op_idx << "," << last_op_idx << "]" - << " of op segment " << opr_names_; + << " of op segment " << opr_names_; // Generate throw-away exec in order to produce a diagnostic listing of graph nodes CudaGraphsSubSegExec dummy(exec_list, rctx, is_gpu, verbose_, last_op_idx, 1); } @@ -466,11 +473,11 @@ class CudaGraphsExec { } // Is the Op OK to make part of a CUDA Graph? 
- bool OpOK(const std::shared_ptr &exec) { - static auto& fstateful = Op::GetAttr("FCreateOpState"); + bool OpOK(const std::shared_ptr& exec) { + static auto& fstateful = Op::GetAttr("FCreateOpState"); static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); - static auto& fcompute_ex = Op::GetAttr("FComputeEx"); - const auto& attrs = exec->attrs; + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); + const auto& attrs = exec->attrs; if (attrs.op != nullptr) { const auto f = fgraphcompatible.get(attrs.op, nullptr); if (f != nullptr) { @@ -478,7 +485,7 @@ class CudaGraphsExec { } if (fstateful.get(attrs.op, nullptr) != nullptr) { if (verbose_) { - LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; + LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; } return false; } @@ -507,10 +514,10 @@ class CudaGraphsExec { } // Determine Tempspaces used by ops. Other resource uses disable CUDA Graphs. - void SetTempSpaces(const std::vector > &exec_list) { + void SetTempSpaces(const std::vector >& exec_list) { // Gather info about the ops use of TempSpace. 
if (is_enabled_) { - std::set tempspaces_set; + std::set tempspaces_set; for (auto& exec : exec_list) { for (auto& resource : exec->op_ctx.requested) { if (resource.req.type == ResourceRequest::kTempSpace) { @@ -523,18 +530,18 @@ class CudaGraphsExec { } // Return the addresses of the gpu TempSpace areas - std::vector GetGPUTempspacePtrs(mshadow::Stream *s) { - std::vector ret; + std::vector GetGPUTempspacePtrs(mshadow::Stream* s) { + std::vector ret; for (const auto& resource : tempspaces_) { // Ask for minimal allocation to get base pointer without increasing the size - auto *base_ptr = resource->get_space_typed(mshadow::Shape1(1), s).dptr_; - ret.push_back(static_cast(base_ptr)); + auto* base_ptr = resource->get_space_typed(mshadow::Shape1(1), s).dptr_; + ret.push_back(static_cast(base_ptr)); } return ret; } CudaGraphCache cache_; - std::vector tempspaces_; + std::vector tempspaces_; std::string opr_names_; bool verbose_; bool is_enabled_; diff --git a/src/imperative/exec_pass.h b/src/imperative/exec_pass.h index 6da6c8d95c69..1b9ef071f676 100644 --- a/src/imperative/exec_pass.h +++ b/src/imperative/exec_pass.h @@ -90,8 +90,8 @@ class OpExecutor { /*! \brief dispatch mode of the executor */ DispatchMode dispatch_mode; - explicit OpExecutor(NodeAttrs attrs, DispatchMode dispatch_mode) : - attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {} + explicit OpExecutor(NodeAttrs attrs, DispatchMode dispatch_mode) + : attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {} /*! \brief virtual destructor */ virtual ~OpExecutor() {} /*! @@ -111,9 +111,10 @@ class OpExecutor { * This function call does not synchronize the stream. * \param rctx The runtime context passed in by environment. */ - static void RunAll(const std::vector > &execs, - RunContext rctx, bool is_gpu) { - for (auto &exec : execs) + static void RunAll(const std::vector >& execs, + RunContext rctx, + bool is_gpu) { + for (auto& exec : execs) exec->Run(rctx, is_gpu); } /*! 
\return the execution type */ diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index bd9fe783ab28..7f90528f4793 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -1254,10 +1254,10 @@ inline Engine::OprHandle CreateEngineOp( // Provide initialized `cuda_graphs_exec`, which when captured // by exec_fun, acts like a static variable inside the mutable closure. cuda_graphs::CudaGraphsExec cuda_graphs_exec(execs, is_gpu, opr_names); - auto exec_fun = [cuda_graphs_exec, execs, is_async, is_gpu] ( - RunContext ctx, - Engine::CallbackOnStart on_start, - Engine::CallbackOnComplete on_complete) mutable { + auto exec_fun = [cuda_graphs_exec, execs, is_async, is_gpu]( + RunContext ctx, + Engine::CallbackOnStart on_start, + Engine::CallbackOnComplete on_complete) mutable { on_start(); if (is_async) { execs[0]->op_ctx.async_on_complete = on_complete; diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu index 33fca6ccea67..0b247900ce4c 100644 --- a/src/operator/contrib/adamw.cu +++ b/src/operator/contrib/adamw.cu @@ -46,33 +46,24 @@ void GetScaleFloat(mshadow::Stream* s, const TBlob& scale_blob, float* NNVM_REGISTER_OP(_adamw_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", adamw::MPUpdate>); NNVM_REGISTER_OP(_mp_adamw_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", adamw::MPUpdate>); NNVM_REGISTER_OP(_multi_adamw_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", adamw::multiMPUpdate); NNVM_REGISTER_OP(_multi_mp_adamw_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - 
return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", adamw::multiMPUpdate); - } // namespace adamw } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/index_array.cu b/src/operator/contrib/index_array.cu index d317ccdf7059..3702fed2a06a 100644 --- a/src/operator/contrib/index_array.cu +++ b/src/operator/contrib/index_array.cu @@ -84,9 +84,7 @@ void IndexArrayForwardGPU(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(_contrib_index_array) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", IndexArrayForwardGPU); } // namespace op diff --git a/src/operator/contrib/multi_lamb.cu b/src/operator/contrib/multi_lamb.cu index 8361e04cabb3..c6bedfc861f8 100644 --- a/src/operator/contrib/multi_lamb.cu +++ b/src/operator/contrib/multi_lamb.cu @@ -269,16 +269,12 @@ void CallKernel2(Stream* s, NNVM_REGISTER_OP(_multi_lamb_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", MultiLAMBUpdate); NNVM_REGISTER_OP(_multi_mp_lamb_update) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", MultiLAMBUpdate); } // namespace op diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 5e8f55881af6..98cddbc97634 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -30,18 +30,20 @@ namespace op { NNVM_REGISTER_OP(Dropout) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool is_train) { - // Dropout is just passthrough during inference for all impls - if (!is_train) - return true; + [](const NodeAttrs& attrs, const bool is_train) { + // Dropout is a passthrough during 
inference for all impls + if (!is_train) + return true; - // cuDNN impl is compatible during training as well - const DropoutParam& param = nnvm::get(attrs.parsed); - real_t pkeep = 1.0f - param.p; - bool cudnn_off = param.cudnn_off && param.cudnn_off.value(); - bool cudnn_available = pkeep > 0 && !cudnn_off; - return MXNET_USE_CUDNN_DROPOUT && cudnn_available; - }) + // cuDNN impl is compatible during training as well + const DropoutParam& param = + nnvm::get(attrs.parsed); + real_t pkeep = 1.0f - param.p; + bool cudnn_off = + param.cudnn_off && param.cudnn_off.value(); + bool cudnn_available = pkeep > 0 && !cudnn_off; + return MXNET_USE_CUDNN_DROPOUT && cudnn_available; + }) .set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) diff --git a/src/operator/numpy/linalg/np_eig.cu b/src/operator/numpy/linalg/np_eig.cu index ab5c6644501d..a217b6d4e0e7 100644 --- a/src/operator/numpy/linalg/np_eig.cu +++ b/src/operator/numpy/linalg/np_eig.cu @@ -30,18 +30,14 @@ namespace op { NNVM_REGISTER_OP(_npi_eig) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", EigOpForward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_npi_eigh) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", EighOpForward); #endif diff --git a/src/operator/numpy/linalg/np_eigvals.cu b/src/operator/numpy/linalg/np_eigvals.cu index 94a007d7a245..be00d8c991d9 100644 --- a/src/operator/numpy/linalg/np_eigvals.cu +++ b/src/operator/numpy/linalg/np_eigvals.cu @@ -30,18 +30,14 @@ namespace op { NNVM_REGISTER_OP(_npi_eigvals) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", EigvalsOpForward); #if MXNET_USE_CUSOLVER == 1 
NNVM_REGISTER_OP(_npi_eigvalsh) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", EigvalshOpForward); #endif diff --git a/src/operator/numpy/linalg/np_norm_backward.cu b/src/operator/numpy/linalg/np_norm_backward.cu index e88d717bd580..23a021d00ce5 100644 --- a/src/operator/numpy/linalg/np_norm_backward.cu +++ b/src/operator/numpy/linalg/np_norm_backward.cu @@ -27,10 +27,11 @@ namespace op { NNVM_REGISTER_OP(_backward_npi_norm) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - const NumpyNormParam& param = nnvm::get(attrs.parsed); - return param.axis.value().ndim() == 2; - }) + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = + nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeBackward); } // namespace op diff --git a/src/operator/numpy/linalg/np_norm_forward.cu b/src/operator/numpy/linalg/np_norm_forward.cu index 26a87c580010..7399727324d0 100644 --- a/src/operator/numpy/linalg/np_norm_forward.cu +++ b/src/operator/numpy/linalg/np_norm_forward.cu @@ -27,10 +27,11 @@ namespace op { NNVM_REGISTER_OP(_npi_norm) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - const NumpyNormParam& param = nnvm::get(attrs.parsed); - return param.axis.value().ndim() == 2; - }) + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = + nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeForward); } // namespace op diff --git a/src/operator/numpy/np_boolean_mask_assign.cu b/src/operator/numpy/np_boolean_mask_assign.cu index 8e1bd57386cd..216e8ff2b839 100644 --- a/src/operator/numpy/np_boolean_mask_assign.cu +++ b/src/operator/numpy/np_boolean_mask_assign.cu @@ -274,16 +274,12 @@ void NumpyBooleanAssignForwardGPU(const nnvm::NodeAttrs& attrs, 
NNVM_REGISTER_OP(_npi_boolean_mask_assign_scalar) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); NNVM_REGISTER_OP(_npi_boolean_mask_assign_tensor) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); } // namespace op diff --git a/src/operator/numpy/np_constraint_check.cu b/src/operator/numpy/np_constraint_check.cu index 8d622d152bac..26a5f0178c0b 100644 --- a/src/operator/numpy/np_constraint_check.cu +++ b/src/operator/numpy/np_constraint_check.cu @@ -39,9 +39,7 @@ void GetReduceOutput(mshadow::Stream* s, const TBlob& output_blob, boo NNVM_REGISTER_OP(_npx_constraint_check) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", ConstraintCheckForward); } // namespace op diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index ed9ab81f2066..27858988432d 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -54,16 +54,12 @@ NNVM_REGISTER_OP(_backward_np_column_stack) NNVM_REGISTER_OP(_npi_tril_indices) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", TrilindicesOpForward); NNVM_REGISTER_OP(_npi_roll) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", NumpyRollCompute); template <> @@ -104,16 +100,12 @@ void NumpyFlipForwardImpl(const OpContext& ctx, NNVM_REGISTER_OP(_npi_flip) .set_attr("FIsCUDAGraphsCompatible", - 
[](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", NumpyFlipForward); NNVM_REGISTER_OP(_backward_npi_flip) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", NumpyFlipForward); NNVM_REGISTER_OP(_npi_moveaxis).set_attr("FCompute", NumpyMoveaxisCompute); @@ -125,17 +117,19 @@ NNVM_REGISTER_OP(_npi_rollaxis_backward) NNVM_REGISTER_OP(_npi_rot90) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - const auto& param = nnvm::get(attrs.parsed); - // Should track code in NumpyRot90Compute() - int real_k(param.k); - real_k = real_k % 4; - if (real_k < 0) { - real_k += 4; - } - // Avoid NumpyRot90ComputeFlipIml(), which uses mshadow::Copy() - return real_k != 2; - }) + [](const NodeAttrs& attrs, const bool) { + const auto& param = + nnvm::get(attrs.parsed); + // Should track code in NumpyRot90Compute() + int real_k(param.k); + real_k = real_k % 4; + if (real_k < 0) { + real_k += 4; + } + // Avoid NumpyRot90ComputeFlipIml(), + // which uses mshadow::Copy() + return real_k != 2; + }) .set_attr("FCompute", NumpyRot90Compute); NNVM_REGISTER_OP(_npi_hsplit).set_attr("FCompute", HSplitOpForward); diff --git a/src/operator/numpy/np_nonzero_op.cu b/src/operator/numpy/np_nonzero_op.cu index 8ae39bdf2c4c..597331e458ff 100644 --- a/src/operator/numpy/np_nonzero_op.cu +++ b/src/operator/numpy/np_nonzero_op.cu @@ -116,9 +116,7 @@ NNVM_REGISTER_OP(_npx_nonzero) return std::vector{ResourceRequest::kTempSpace}; }) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs& attrs, const bool) { - return false; - }) + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FComputeEx", NonzeroForwardGPU); } // namespace op diff --git a/src/operator/numpy/np_pad_op.cu b/src/operator/numpy/np_pad_op.cu index 
97cb423b7bab..1b9f4f4d5a86 100644 --- a/src/operator/numpy/np_pad_op.cu +++ b/src/operator/numpy/np_pad_op.cu @@ -31,17 +31,13 @@ namespace op { NNVM_REGISTER_OP(_npi_pad) // Incompatible due to Copy(xpu_tensor, cpu_tensor) in NumpyPadOpForward .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyPadOpForward); NNVM_REGISTER_OP(_backward_npi_pad) // Incompatible due to Copy(xpu_tensor, cpu_tensor) in NumpyPadOpBackward .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyPadOpBackward); } // namespace op diff --git a/src/operator/numpy/np_percentile_op.cu b/src/operator/numpy/np_percentile_op.cu index 7ff67ad3c9c5..2dcc8294bb55 100644 --- a/src/operator/numpy/np_percentile_op.cu +++ b/src/operator/numpy/np_percentile_op.cu @@ -54,9 +54,7 @@ bool CheckInvalidInput(mshadow::Stream* s, NNVM_REGISTER_OP(_npi_percentile) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyPercentileForward); } // namespace op diff --git a/src/operator/numpy/random/np_exponential_op.cu b/src/operator/numpy/random/np_exponential_op.cu index 7a2068dad8a1..c908c9bb3372 100644 --- a/src/operator/numpy/random/np_exponential_op.cu +++ b/src/operator/numpy/random/np_exponential_op.cu @@ -28,15 +28,9 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_exponential) -<<<<<<< HEAD - .set_attr("FCompute", NumpyExponentialForward); -======= .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) -.set_attr("FCompute", NumpyExponentialForward); ->>>>>>> f4bcd48dd... 
[1.x][FEATURE] CUDA graphs support (#19142) + [](const NodeAttrs&, const bool) { return false; }) + .set_attr("FCompute", NumpyExponentialForward); NNVM_REGISTER_OP(_backward_broadcast_exponential) .set_attr("FCompute", ExponentialReparamBackward); diff --git a/src/operator/numpy/random/np_gamma_op.cu b/src/operator/numpy/random/np_gamma_op.cu index 8bfc61aad7ab..0191fd597ec6 100644 --- a/src/operator/numpy/random/np_gamma_op.cu +++ b/src/operator/numpy/random/np_gamma_op.cu @@ -30,9 +30,7 @@ namespace op { NNVM_REGISTER_OP(_npi_gamma) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyGammaForward); NNVM_REGISTER_OP(_backward_gamma_sample).set_attr("FCompute", NumpyGammaGrad); diff --git a/src/operator/numpy/random/np_multinomial_op.cu b/src/operator/numpy/random/np_multinomial_op.cu index ee77b79c6d91..575ad08b8184 100644 --- a/src/operator/numpy/random/np_multinomial_op.cu +++ b/src/operator/numpy/random/np_multinomial_op.cu @@ -42,9 +42,7 @@ void CheckPvalGPU(const OpContext& ctx, DType* input, int prob_length) { NNVM_REGISTER_OP(_npi_multinomial) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyMultinomialForward); } // namespace op diff --git a/src/operator/numpy/random/np_normal_op.cu b/src/operator/numpy/random/np_normal_op.cu index 3d310f82b20d..525a0e14a4e4 100644 --- a/src/operator/numpy/random/np_normal_op.cu +++ b/src/operator/numpy/random/np_normal_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_normal) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyNormalForward); NNVM_REGISTER_OP(_backward_broadcast_normal) @@ -39,9 +37,7 @@ 
NNVM_REGISTER_OP(_backward_broadcast_normal) NNVM_REGISTER_OP(_npi_normal_n) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyNormalForward); } // namespace op diff --git a/src/operator/numpy/random/np_pareto_op.cu b/src/operator/numpy/random/np_pareto_op.cu index 59900a0090e8..2948660f6e75 100644 --- a/src/operator/numpy/random/np_pareto_op.cu +++ b/src/operator/numpy/random/np_pareto_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_pareto) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyParetoForward); NNVM_REGISTER_OP(_backward_broadcast_pareto) diff --git a/src/operator/numpy/random/np_power_op.cu b/src/operator/numpy/random/np_power_op.cu index f5bcb1000771..f7a6686769d0 100644 --- a/src/operator/numpy/random/np_power_op.cu +++ b/src/operator/numpy/random/np_power_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_powerd) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyPowerForward); } // namespace op diff --git a/src/operator/numpy/random/np_rayleigh_op.cu b/src/operator/numpy/random/np_rayleigh_op.cu index c17d6c2bdd79..f67a2fe36ad7 100644 --- a/src/operator/numpy/random/np_rayleigh_op.cu +++ b/src/operator/numpy/random/np_rayleigh_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_rayleigh) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyRayleighForward); NNVM_REGISTER_OP(_backward_broadcast_rayleigh) diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index 
baa36c6be587..5099301a1e4f 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -117,9 +117,7 @@ void ShapeComputeGPU(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(shape_array) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", ShapeComputeGPU); void SizeComputeGPU(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index f4ce8ebcfd8a..992054f860ef 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -959,9 +959,7 @@ NNVM_REGISTER_OP(one_hot).set_attr("FCompute", OneHotOpForward("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", GatherNDForwardGPU); NNVM_REGISTER_OP(scatter_nd).set_attr("FCompute", ScatterNDForward); diff --git a/src/operator/tensor/la_op.cu b/src/operator/tensor/la_op.cu index 8cae4a4d5be8..a32143a31d60 100644 --- a/src/operator/tensor/la_op.cu +++ b/src/operator/tensor/la_op.cu @@ -89,9 +89,7 @@ NNVM_REGISTER_OP(_backward_linalg_maketrian) NNVM_REGISTER_OP(_linalg_potri) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_potri) @@ -107,45 +105,35 @@ NNVM_REGISTER_OP(_linalg_det) // Incompatibility comes from allocs made in linalg_batch_getrf(), called by det::op() // see https://github.com/apache/incubator-mxnet/issues/19353 .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpDetForward); NNVM_REGISTER_OP(_backward_linalg_det) // Incompatibility comes from allocs made in 
linalg_batch_getri(), // called by linalg_batch_det_backward_helper, called by det_backward::op() .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpDetBackward); NNVM_REGISTER_OP(_linalg_slogdet) // Incompatibility comes from allocs made in linalg_batch_getrf(), // called by slogdet::op(). .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpDetForward); NNVM_REGISTER_OP(_backward_linalg_slogdet) // Incompatibility comes from allocs made in linalg_batch_getri(), // called by linalg_batch_det_backward_helper, called by slogdet_backward::op() .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpDetBackward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_linalg_potrf) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_potrf) @@ -153,9 +141,7 @@ NNVM_REGISTER_OP(_backward_linalg_potrf) NNVM_REGISTER_OP(_linalg_gelqf) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpForward); NNVM_REGISTER_OP(_backward_linalg_gelqf) @@ -163,9 +149,7 @@ NNVM_REGISTER_OP(_backward_linalg_gelqf) NNVM_REGISTER_OP(_linalg_syevd) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", LaOpForwSyevd); NNVM_REGISTER_OP(_backward_linalg_syevd) diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index 
e26b767802a3..00007bd2e602 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -414,16 +414,12 @@ NNVM_REGISTER_OP(_backward_tile).set_attr("FCompute", TileOpBackw NNVM_REGISTER_OP(reverse) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", ReverseOpForward); NNVM_REGISTER_OP(_backward_reverse) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", ReverseOpForward); NNVM_REGISTER_OP(stack).set_attr("FCompute", StackOpForward); @@ -442,17 +438,13 @@ NNVM_REGISTER_OP(space_to_depth).set_attr("FCompute", SpaceToDept NNVM_REGISTER_OP(_split_v2) // Incompatible due to Copy(xpu_tensor, cpu_tensor) in SplitOpForwardImpl .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", SplitOpForwardGPU); NNVM_REGISTER_OP(_split_v2_backward) // Incompatible due to Copy(xpu_tensor, cpu_tensor) in SplitOpBackwardImpl .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", SplitOpBackward); } // namespace op From a55892277504c4f824df984950eea7454ce78f4b Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 17 Feb 2022 15:29:25 -0800 Subject: [PATCH 09/30] More clang-tidy fixes --- include/mxnet/op_attr_types.h | 2 +- src/imperative/attach_op_execs_pass.cc | 11 +++-------- src/imperative/cuda_graphs.h | 9 +++++---- src/imperative/exec_pass.h | 2 +- src/operator/nn/dropout.cu | 1 - src/operator/numpy/random/np_bernoulli_op.cu | 4 +--- src/operator/numpy/random/np_exponential_op.cu | 4 ++-- src/operator/numpy/random/np_pareto_op.cu | 2 +- src/operator/numpy/random/np_weibull_op.cu | 4 +--- 9 
files changed, 15 insertions(+), 24 deletions(-) diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index 73504bb2748b..c936d3e84afa 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -366,7 +366,7 @@ using FNeedCalibrateOutput = std::function(const NodeAttrs& att * to stay the same as long as the shape and type * of input stays the same. */ -using FIsCUDAGraphsCompatible = std::function; +using FIsCUDAGraphsCompatible = std::function; #endif diff --git a/src/imperative/attach_op_execs_pass.cc b/src/imperative/attach_op_execs_pass.cc index 51736c577e5b..732391fdd747 100644 --- a/src/imperative/attach_op_execs_pass.cc +++ b/src/imperative/attach_op_execs_pass.cc @@ -257,9 +257,7 @@ class FComputeExExecutor : public OpExecutor { DispatchMode dispatch_mode, FComputeEx fcompute, ExecType exec_type) - : OpExecutor(attrs, dispatch_mode), - fcompute_(std::move(fcompute)), - exec_type_(exec_type) {} + : OpExecutor(attrs, dispatch_mode), fcompute_(std::move(fcompute)), exec_type_(exec_type) {} private: FComputeEx fcompute_; @@ -315,11 +313,8 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, common::GetFCompute(op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(inode.source->attrs, - dispatch_modes[i], - state, - fcompute_ex, - exec_type); + ret[i] = std::make_shared( + inode.source->attrs, dispatch_modes[i], state, fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute(op, "FStatefulCompute", vctx[i]); diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index a264a6edbfc4..9c64972e9f2c 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -112,7 +112,8 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { case 
cudaMemcpyDeviceToDevice: ss << "Device->Device "; break; - default: break; + default: + break; } ss << "Memcpy"; } break; @@ -178,7 +179,7 @@ class CudaGraphsSubSegExec { int from_op_idx, int num_ops, bool ops_are_cuda_graph_compatible = true) - : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr) { + : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr) { if (ops_are_cuda_graph_compatible) { MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); MakeGraphExec(); @@ -308,7 +309,7 @@ class CudaGraphsExec { public: CudaGraphsExec(const std::vector >& exec_list, bool is_gpu, - const char *opr_names) + const char* opr_names) : verbose_(false), is_enabled_(false) { opr_names_ = opr_names ? std::string(opr_names) : std::string(); if (is_gpu) { @@ -448,7 +449,7 @@ class CudaGraphsExec { if (verbose_) LOG(INFO) << "Capturing CUDA graph of op sub segment[" << first_op_idx << ":" << (first_op_idx + num_good_ops - 1) << "]" - << " of op segment " << opr_names_; + << " of op segment " << opr_names_; CudaGraphsSubSegExec good_opseg( exec_list, rctx, is_gpu, verbose_, first_op_idx, num_good_ops); CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; diff --git a/src/imperative/exec_pass.h b/src/imperative/exec_pass.h index 1b9ef071f676..02fa967a19b7 100644 --- a/src/imperative/exec_pass.h +++ b/src/imperative/exec_pass.h @@ -111,7 +111,7 @@ class OpExecutor { * This function call does not synchronize the stream. * \param rctx The runtime context passed in by environment. 
*/ - static void RunAll(const std::vector >& execs, + static void RunAll(const std::vector>& execs, RunContext rctx, bool is_gpu) { for (auto& exec : execs) diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 98cddbc97634..6b4bfc8aebd2 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -34,7 +34,6 @@ NNVM_REGISTER_OP(Dropout) // Dropout is a passthrough during inference for all impls if (!is_train) return true; - // cuDNN impl is compatible during training as well const DropoutParam& param = nnvm::get(attrs.parsed); diff --git a/src/operator/numpy/random/np_bernoulli_op.cu b/src/operator/numpy/random/np_bernoulli_op.cu index 0cf9bd95ab7c..eee89c1ea8d4 100644 --- a/src/operator/numpy/random/np_bernoulli_op.cu +++ b/src/operator/numpy/random/np_bernoulli_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_bernoulli) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyBernoulliForward); } // namespace op diff --git a/src/operator/numpy/random/np_exponential_op.cu b/src/operator/numpy/random/np_exponential_op.cu index c908c9bb3372..8ad738639eae 100644 --- a/src/operator/numpy/random/np_exponential_op.cu +++ b/src/operator/numpy/random/np_exponential_op.cu @@ -28,8 +28,8 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_exponential) -.set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { return false; }) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyExponentialForward); NNVM_REGISTER_OP(_backward_broadcast_exponential) diff --git a/src/operator/numpy/random/np_pareto_op.cu b/src/operator/numpy/random/np_pareto_op.cu index 2948660f6e75..82fcd1f4d066 100644 --- a/src/operator/numpy/random/np_pareto_op.cu +++ b/src/operator/numpy/random/np_pareto_op.cu @@ -29,7 +29,7 @@ namespace op { 
NNVM_REGISTER_OP(_npi_pareto) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { return false; }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyParetoForward); NNVM_REGISTER_OP(_backward_broadcast_pareto) diff --git a/src/operator/numpy/random/np_weibull_op.cu b/src/operator/numpy/random/np_weibull_op.cu index 62c0a564c060..4495bab39206 100644 --- a/src/operator/numpy/random/np_weibull_op.cu +++ b/src/operator/numpy/random/np_weibull_op.cu @@ -29,9 +29,7 @@ namespace op { NNVM_REGISTER_OP(_npi_weibull) .set_attr("FIsCUDAGraphsCompatible", - [](const NodeAttrs&, const bool) { - return false; - }) + [](const NodeAttrs&, const bool) { return false; }) .set_attr("FCompute", NumpyWeibullForward); NNVM_REGISTER_OP(_backward_broadcast_weibull) From eaa7fc7eb70043d8575bc6235f9abb6dfddc7582 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 17 Feb 2022 17:50:49 -0800 Subject: [PATCH 10/30] Avoid CUDA_CALL(e): improper macro expansion --- src/imperative/cuda_graphs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 9c64972e9f2c..8a8a06fa6b6e 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -196,9 +196,9 @@ class CudaGraphsSubSegExec { cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; cudaGraphNode_t error_node; - cudaError_t e = + cudaError_t err = cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), &error_node, &update_result); - switch (e) { + switch (err) { case cudaErrorGraphExecUpdateFailure: MakeGraphExec(); break; @@ -207,7 +207,7 @@ class CudaGraphsSubSegExec { break; default: // Respond normally to unusual cudaGraphExecUpdate() ret vals - CUDA_CALL(e); + CUDA_CALL(err); } } From c44cfc64158a24faaa6f4de232085fda00086187 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 17 Feb 2022 19:39:17 -0800 Subject: [PATCH 11/30] Add compile guard to Dropout's 
FIsCUDAGraphsCompatible def --- src/operator/nn/dropout.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 6b4bfc8aebd2..414b82edcc65 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -34,6 +34,7 @@ NNVM_REGISTER_OP(Dropout) // Dropout is a passthrough during inference for all impls if (!is_train) return true; +#if MXNET_USE_CUDNN_DROPOUT // cuDNN impl is compatible during training as well const DropoutParam& param = nnvm::get(attrs.parsed); @@ -41,7 +42,10 @@ NNVM_REGISTER_OP(Dropout) bool cudnn_off = param.cudnn_off && param.cudnn_off.value(); bool cudnn_available = pkeep > 0 && !cudnn_off; - return MXNET_USE_CUDNN_DROPOUT && cudnn_available; + return cudnn_available; +#else + return false; +#endif // MXNET_USE_CUDNN_DROPOUT }) .set_attr("FStatefulCompute", DropoutCompute); From 5a2f847558a7f55790f1ad1fb5ee930b4ad1a3a9 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 18 Feb 2022 00:37:34 -0800 Subject: [PATCH 12/30] Temporarily add '-s' to pytest serial tests --- ci/docker/runtime_functions.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 05f80032cd15..69453c39e538 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -801,7 +801,8 @@ cd_unittest_ubuntu() { local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"} OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest - pytest -m 'serial' --durations=50 --verbose tests/python/unittest + # Temporarily tell pytest to not capture output ('-s') to get more insight into Python: Aborted error + pytest -m 'serial' --durations=50 --verbose -s --log-cli-level=DEBUG tests/python/unittest # https://github.com/apache/incubator-mxnet/issues/11801 # if [[ ${mxnet_variant} = "cpu" ]] || [[ ${mxnet_variant} = 
"mkl" ]]; then From 3b58b49e3d95b815f0ea6b3dd7849101630ac04e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sat, 26 Feb 2022 23:12:55 -0800 Subject: [PATCH 13/30] Fix DropoutOp.dropout_passthrough_ handling for CUDA Graphs --- src/operator/nn/dropout-inl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 18f94cffd25b..0baa8e40c397 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -437,7 +437,6 @@ class DropoutOp { using namespace mshadow::expr; Stream* s = ctx.get_stream(); if (!this->dropout_passthrough_) { - this->dropout_passthrough_ = true; const TBlob& gdata = in_grad[dropout::kData]; const TBlob& grad = out_grad[dropout::kOut]; const TBlob& mask = out_data[dropout::kMask]; From 0d620833d02a41fb218cd2526de903db48fa2877 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 27 Feb 2022 14:34:27 -0800 Subject: [PATCH 14/30] Adapt test_gluon_gpu.py::test_cuda_graphs for gluon2.0 --- tests/python/gpu/test_gluon_gpu.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index af6805c9cd5c..20a7f26e8686 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -596,21 +596,22 @@ def test_cudnn_dropout_reproducibility(): assert_almost_equal(a.grad, b.grad) +@mx.util.use_np def test_cuda_graphs(): class GraphTester(gluon.HybridBlock): def __init__(self, function_to_test, **kwargs): super(GraphTester, self).__init__(**kwargs) self.f = function_to_test() - def hybrid_forward(self, F, *args): + def forward(self, *args): # We need to isolate the operation to be fully inside the graph # in order for graphs usage to be possible - copied_args = [F.identity(a) for a in args] + copied_args = [mx.np.copy(a) for a in args] outputs = self.f(*copied_args) if isinstance(outputs, (list, tuple)): - return [F.identity(o) for o in outputs] + 
return [mx.np.copy(o) for o in outputs] else: - return F.identity(outputs) + return mx.np.copy(outputs) class TestDesc: def __init__(self, name, f, num_inputs=1, input_dim=4): @@ -620,8 +621,8 @@ def __init__(self, name, f, num_inputs=1, input_dim=4): self.input_dim = input_dim def generate_inputs(self): - shape = tuple(np.random.randint(4, 11, size=self.input_dim)) - ret = [mx.random.uniform(shape=shape) for _ in range(self.num_inputs)] + shape = tuple(_np.random.randint(4, 11, size=self.input_dim)) + ret = [mx.np.random.uniform(size=shape) for _ in range(self.num_inputs)] for r in ret: r.attach_grad() return ret @@ -654,6 +655,7 @@ def generate_inputs(self): with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', 'MXNET_USE_FUSION': '0'}): + device = mx.gpu(0) for test_desc in tested_ops: print("Testing ", test_desc.name) inputs = test_desc.generate_inputs() @@ -665,8 +667,8 @@ def generate_inputs(self): netg = GraphTester(test_desc.f) # initialize parameters - net.initialize() - netg.initialize() + net.initialize(device=device) + netg.initialize(device=device) net(*inputs) @@ -680,7 +682,7 @@ def generate_inputs(self): for _ in range(N): assert_almost_equal(net(*inputs), netg(*inputsg)) - mx.nd.waitall() + mx.npx.waitall() print("Testing training mode") for _ in range(N): with random_seed(seed): @@ -701,4 +703,4 @@ def generate_inputs(self): assert_almost_equal(p1.data(), p2.data()) if p1.grad_req != 'null': assert_almost_equal(p1.grad(), p2.grad()) - mx.nd.waitall() + mx.npx.waitall() From 3591f509f2f28e7a6e3166a25bf65fcce17631e3 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 27 Feb 2022 15:27:18 -0800 Subject: [PATCH 15/30] Create CUDA Graph 'dot' files if MXNET_CUDA_GRAPHS_DBG_FILE= --- docs/static_site/src/pages/api/faq/env_var.md | 6 ++ src/imperative/cuda_graphs.h | 59 ++++++++++++++----- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md index 
1a4421d2e50f..8e12b48aac76 100644 --- a/docs/static_site/src/pages/api/faq/env_var.md +++ b/docs/static_site/src/pages/api/faq/env_var.md @@ -180,6 +180,12 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 * MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES - Values: Int ```(default=0)``` - The maximum number of log messages generated by CUDA graphs executor. +* MXNET_CUDA_GRAPHS_DBG_FILE + - Values: String ```(default='', to indicate no debug dot files should be created)``` + - The file prefix for '.dot' files for each graph created. Full path is -devN-{trn,inf}..dot . +* MXNET_CUDA_GRAPHS_DBG_FILE_FLAGS + - Values: Int ```(default=)``` + - A bitmask to enable various types of info in the debug '.dot' files. See cudaGraphDebugDotFlags in the CUDA runtime API doc for details. ## Control the Data Communication diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 8a8a06fa6b6e..5c954011e4d2 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -30,6 +30,7 @@ #include #include #include +#include #include "./exec_pass.h" #include "../common/cuda/utils.h" @@ -179,10 +180,11 @@ class CudaGraphsSubSegExec { int from_op_idx, int num_ops, bool ops_are_cuda_graph_compatible = true) - : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr) { + : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr), + graph_exec_id_(0) { if (ops_are_cuda_graph_compatible) { MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); - MakeGraphExec(); + MakeGraphExec(exec_list, rctx); } } @@ -200,7 +202,7 @@ class CudaGraphsSubSegExec { cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), &error_node, &update_result); switch (err) { case cudaErrorGraphExecUpdateFailure: - MakeGraphExec(); + MakeGraphExec(exec_list, rctx); break; case cudaSuccess: CHECK_EQ(update_result, cudaGraphExecUpdateSuccess); @@ -229,6 +231,12 @@ class CudaGraphsSubSegExec { return graph_exec_ != nullptr; } + int 
NumGraphNodes() { + size_t numNodes; + CUDA_CALL(cudaGraphGetNodes(graph_.get(), static_cast(nullptr), &numNodes)); + return numNodes; + } + private: void MakeGraph(const std::vector >& exec_list, const RunContext& rctx, @@ -259,7 +267,9 @@ class CudaGraphsSubSegExec { } } - void MakeGraphExec() { + void MakeGraphExec(const std::vector>& exec_list, + const RunContext& rctx) { + // Note that this routine is not invoked when a graph executor is merely updated. cudaGraphExec_t cuda_graph_exec; cudaGraphNode_t error_node; char log_buffer[1000]; @@ -268,14 +278,34 @@ class CudaGraphsSubSegExec { graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); // At this point we have a CUDA Graph executor - static int num_graph_creations_logged = 0; - static int max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); - if (num_graph_creations_logged < max_log_entries) { - num_graph_creations_logged++; - LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; - if (num_graph_creations_logged == max_log_entries) + static int num_graph_creations = 0; + graph_exec_id_ = num_graph_creations++; + + static size_t max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); + if (graph_exec_id_ < max_log_entries) { + LOG(INFO) << "Created CUDA graph " << graph_exec_id_; + if (num_graph_creations == max_log_entries) LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; } + // Create a .dot file for graph visualization if requested + static std::string dotfile_base = dmlc::GetEnv("MXNET_CUDA_GRAPHS_DBG_FILE", std::string()); + if (dotfile_base.size() > 0) { +#if CUDA_VERSION >= 11030 + static int dotfile_flags = dmlc::GetEnv("MXNET_CUDA_GRAPHS_DBG_FILE_FLAGS", + static_cast(cudaGraphDebugDotFlagsVerbose)); + std::ostringstream filename; + const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; + int dev_id = rctx.ctx.dev_id; + filename << dotfile_base << "-" << "dev" << dev_id << "-" << (is_train ? 
"trn" : "inf") + << "-" << graph_exec_id_ << ".dot"; + CUDA_CALL(cudaGraphDebugDotPrint(graph_.get(), filename.str().c_str(), dotfile_flags)); +#else + static bool dot_file_unsupported = []() { + LOG(INFO) << "MXNET_CUDA_GRAPHS_DBG_FILE setting ignored- requires CUDA version >= 11.3"; + return true; + }(); +#endif // CUDA_VERSION >= 11030 + } } int from_op_idx_; @@ -284,6 +314,7 @@ class CudaGraphsSubSegExec { using cudaGraphExecStruct_t = typename std::remove_pointer::type; std::shared_ptr graph_; std::shared_ptr graph_exec_; + size_t graph_exec_id_; }; // The CudaGraph executor and associated Tempspace ptrs for which it is valid. @@ -404,13 +435,13 @@ class CudaGraphsExec { int runnable_execs = 0; int bypassed_ops = 0; for (auto& subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { - if (subseg_exec.IsRunnable()) + if (subseg_exec.IsRunnable()) { + LOG(INFO) << "Launching captured graph with " << subseg_exec.NumGraphNodes() << " nodes."; runnable_execs++; - else + } else { bypassed_ops++; + } } - LOG(INFO) << "Launching " << runnable_execs << " captured CUDA graph(s) for op segment " - << opr_names_; if (bypassed_ops > 0) LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; } From 0e105ec6d189abe9b9010cb78cf5d22fa827d9b9 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 27 Feb 2022 15:36:38 -0800 Subject: [PATCH 16/30] Fix clang-tidy --- src/imperative/cuda_graphs.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 5c954011e4d2..25fd003b7eaf 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -173,14 +173,17 @@ class CudaGraphExecDeleter { // characterized by a starting index in the OpExecutor list and a number of ops. 
class CudaGraphsSubSegExec { public: - CudaGraphsSubSegExec(const std::vector >& exec_list, + CudaGraphsSubSegExec(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu, bool verbose, int from_op_idx, int num_ops, bool ops_are_cuda_graph_compatible = true) - : from_op_idx_(from_op_idx), num_ops_(num_ops), graph_(nullptr), graph_exec_(nullptr), + : from_op_idx_(from_op_idx), + num_ops_(num_ops), + graph_(nullptr), + graph_exec_(nullptr), graph_exec_id_(0) { if (ops_are_cuda_graph_compatible) { MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); @@ -188,7 +191,7 @@ class CudaGraphsSubSegExec { } } - void Update(const std::vector >& exec_list, + void Update(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu, bool verbose) { @@ -213,7 +216,7 @@ class CudaGraphsSubSegExec { } } - void RunSubSeg(const std::vector >& exec_list, + void RunSubSeg(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu) { if (IsRunnable()) { @@ -238,7 +241,7 @@ class CudaGraphsSubSegExec { } private: - void MakeGraph(const std::vector >& exec_list, + void MakeGraph(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu, bool verbose, @@ -279,7 +282,7 @@ class CudaGraphsSubSegExec { // At this point we have a CUDA Graph executor static int num_graph_creations = 0; - graph_exec_id_ = num_graph_creations++; + graph_exec_id_ = num_graph_creations++; static size_t max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); if (graph_exec_id_ < max_log_entries) { @@ -295,9 +298,10 @@ class CudaGraphsSubSegExec { static_cast(cudaGraphDebugDotFlagsVerbose)); std::ostringstream filename; const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; - int dev_id = rctx.ctx.dev_id; - filename << dotfile_base << "-" << "dev" << dev_id << "-" << (is_train ? 
"trn" : "inf") - << "-" << graph_exec_id_ << ".dot"; + int dev_id = rctx.ctx.dev_id; + filename << dotfile_base << "-" + << "dev" << dev_id << "-" << (is_train ? "trn" : "inf") << "-" << graph_exec_id_ + << ".dot"; CUDA_CALL(cudaGraphDebugDotPrint(graph_.get(), filename.str().c_str(), dotfile_flags)); #else static bool dot_file_unsupported = []() { @@ -338,7 +342,7 @@ using CudaGraphCache = std::map; class CudaGraphsExec { public: - CudaGraphsExec(const std::vector >& exec_list, + CudaGraphsExec(const std::vector>& exec_list, bool is_gpu, const char* opr_names) : verbose_(false), is_enabled_(false) { @@ -350,7 +354,7 @@ class CudaGraphsExec { } } - void RunAll(const std::vector >& exec_list, + void RunAll(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu) { // If this a CPU op or CUDA Graphs use isn't possible, run normally and return @@ -546,7 +550,7 @@ class CudaGraphsExec { } // Determine Tempspaces used by ops. Other resource uses disable CUDA Graphs. - void SetTempSpaces(const std::vector >& exec_list) { + void SetTempSpaces(const std::vector>& exec_list) { // Gather info about the ops use of TempSpace. if (is_enabled_) { std::set tempspaces_set; From d8d65c9ff094bd8a01e85ec36e98d63a1c69f8f0 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 27 Feb 2022 15:40:19 -0800 Subject: [PATCH 17/30] Fix more clang-tidy --- src/imperative/cuda_graphs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 25fd003b7eaf..f7c1663c4efa 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -457,7 +457,7 @@ class CudaGraphsExec { // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). If such a graph // is not runnable, e.g. if it includes memcpys from unpinned cpu memory, then make a // number of smaller graphs that avoid those ops with the memcpys. 
- void CreateSubExecOverRegion(const std::vector >& exec_list, + void CreateSubExecOverRegion(const std::vector>& exec_list, const RunContext& rctx, bool is_gpu, size_t from_op_idx, From 26182fb501eb5bb40df2c76ff53aab8ab55d3e0e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 16 Feb 2022 22:40:35 -0800 Subject: [PATCH 18/30] Skip test_np_standard_binary_funcs test of 0-dim array broadcast --- tests/python/unittest/test_numpy_op.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 8008c053cd2b..806fb130ab27 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -11712,8 +11712,12 @@ def array_values(low, high, shape): ((3, 1), (3, 0)), ((0, 2), (1, 2)), ((2, 3, 4), (3, 1)), - ((2, 3), ()), - ((), (2, 3)) +# MXNet numpy does not match original numpy behavior when broadcasting 0-dim arrays. +# See https://github.com/apache/incubator-mxnet/issues/20898. 
+# ((2, 3), ()), +# ((), (2, 3)) + ((2, 3), (1,)), + ((1,), (2, 3)) ]) def test_np_standard_binary_funcs(func, func2, promoted, dtypes, ref_grad_a, ref_grad_b, low, high, lshape, rshape): class TestStandardBinary(HybridBlock): From 6cc8ab8d37b3eb1c1849e4b625f2a93c5bb2756e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 20 Feb 2022 20:44:57 -0800 Subject: [PATCH 19/30] Improve test_rnn_layers_fp{16,32} invocation --- tests/python/unittest/test_gluon_rnn.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 2911f9165244..7f86034fe0b8 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -606,7 +606,8 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, device=m @mx.util.use_np -def run_rnn_layers(dtype, dtype2, device=mx.cpu()): +def run_rnn_layers(dtype, dtype2): + device = default_device() check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), device=device) check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), device=device) @@ -673,10 +674,12 @@ def test_rnn_layers_fp32(): run_rnn_layers('float32', 'float32') @assert_raises_cudnn_not_satisfied(min_version='5.1.10') -@pytest.mark.skipif(mx.device.num_gpus() == 0, reason="RNN FP16 only implemented for GPU for now") @pytest.mark.serial def test_rnn_layers_fp16(): - run_rnn_layers('float16', 'float32', mx.gpu()) + # Dynamic skip condition is best handled this way, rather than with pytest.mark.skipIf + if default_device().device_type == 'cpu': + pytest.skip('RNN FP16 only implemented for GPU for now') + run_rnn_layers('float16', 'float32') def check_rnn_consistency(fused_layer, stack_layer, loss, mode, num_layers, input_size, hidden_size, bidirectional=False, rtol=1e-2, atol=1e-4): From 
d06b1395a4ab37db2f9aef7155c232def223a5f6 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 21 Feb 2022 10:42:28 -0800 Subject: [PATCH 20/30] Run test_rnn_layers_fp32 only when cuDNN is present --- tests/python/unittest/test_gluon_rnn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 7f86034fe0b8..c65a82ca539b 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -669,6 +669,7 @@ def run_rnn_layers(dtype, dtype2): out.backward() out = out.asnumpy() +@assert_raises_cudnn_not_satisfied(min_version='5.1.10') @pytest.mark.serial def test_rnn_layers_fp32(): run_rnn_layers('float32', 'float32') From c5198c2914e686018a8b1ee0848f0dc3772e3133 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 21 Feb 2022 16:58:32 -0800 Subject: [PATCH 21/30] Fix potential out-of-bounds write in count_sketch.cu --- src/operator/contrib/count_sketch.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/operator/contrib/count_sketch.cu b/src/operator/contrib/count_sketch.cu index 24ca7970e064..bb16695caa74 100644 --- a/src/operator/contrib/count_sketch.cu +++ b/src/operator/contrib/count_sketch.cu @@ -93,6 +93,9 @@ __global__ void sketch_backward_kernel(const int nthreads, // only calculate gradient regarding x // can also calculate gradient regarding s if needed const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= nthreads) { + return; + } const int i_indim = index % in_dim; const int i_sample = index / in_dim; const int i_outdim = i_sample * out_dim + h[i_indim]; From e013a85ea599fa761cb98762f11feab6e7d74049 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 1 Mar 2022 08:56:49 -0800 Subject: [PATCH 22/30] Add temp output to debug centos crash --- tests/python/gpu/test_gluon_gpu.py | 95 +++++++++++++++--------------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/tests/python/gpu/test_gluon_gpu.py 
b/tests/python/gpu/test_gluon_gpu.py index 20a7f26e8686..46e937377bc6 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -597,7 +597,8 @@ def test_cudnn_dropout_reproducibility(): assert_almost_equal(a.grad, b.grad) @mx.util.use_np -def test_cuda_graphs(): +@pytest.mark.serial +def test_cuda_graphs(capsys): class GraphTester(gluon.HybridBlock): def __init__(self, function_to_test, **kwargs): super(GraphTester, self).__init__(**kwargs) @@ -653,54 +654,56 @@ def generate_inputs(self): N = 10 - with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', - 'MXNET_USE_FUSION': '0'}): - device = mx.gpu(0) - for test_desc in tested_ops: - print("Testing ", test_desc.name) - inputs = test_desc.generate_inputs() - inputsg = [i.copy() for i in inputs] - for i in inputsg: - i.attach_grad() - seed = random.randint(0, 10000) - net = GraphTester(test_desc.f) - netg = GraphTester(test_desc.f) - - # initialize parameters - net.initialize(device=device) - netg.initialize(device=device) - - net(*inputs) + with capsys.disabled(): + with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', + 'MXNET_CUDA_GRAPHS_VERBOSE': '1', + 'MXNET_USE_FUSION': '0'}): + device = mx.gpu(0) + for test_desc in tested_ops: + sys.stdout.write('Testing {}\n'.format( test_desc.name)) + inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) + + # initialize parameters + net.initialize(device=device) + netg.initialize(device=device) + + net(*inputs) - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - p2.set_data(p1.data()) - - netg.hybridize(static_alloc=True, static_shape=True) - - print("Testing inference mode") - with random_seed(seed): - for _ in range(N): - assert_almost_equal(net(*inputs), netg(*inputsg)) + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): 
+ p2.set_data(p1.data()) - mx.npx.waitall() - print("Testing training mode") - for _ in range(N): - with random_seed(seed): - with mx.autograd.record(): - out = net(*inputs) - out.backward() + netg.hybridize(static_alloc=True, static_shape=True) + print(" Testing inference mode") with random_seed(seed): - with mx.autograd.record(): - outg = netg(*inputsg) - outg.backward() + for _ in range(N): + assert_almost_equal(net(*inputs), netg(*inputsg)) - assert_almost_equal(out, outg) - for i, ig in zip(inputs, inputsg): - assert_almost_equal(i.grad, ig.grad) - - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - assert_almost_equal(p1.data(), p2.data()) - if p1.grad_req != 'null': - assert_almost_equal(p1.grad(), p2.grad()) + mx.npx.waitall() + print(" Testing training mode") + for _ in range(N): + with random_seed(seed): + with mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) mx.npx.waitall() From 7651c97310481d485b9e320b1bd2811d14b2692c Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 3 Mar 2022 22:10:28 -0800 Subject: [PATCH 23/30] Mark InstanceNorm and LeakyRELU as not CUDA Graphs compatible --- src/operator/instance_norm.cu | 7 ++++++- src/operator/leaky_relu.cu | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/operator/instance_norm.cu b/src/operator/instance_norm.cu index ca45dbbff386..ce11fbf3200d 100644 --- a/src/operator/instance_norm.cu +++ b/src/operator/instance_norm.cu @@ -28,9 +28,14 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(InstanceNorm).set_attr("FCompute", 
InstanceNormForward); +NNVM_REGISTER_OP(InstanceNorm) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { return false; }) + .set_attr("FCompute", InstanceNormForward); NNVM_REGISTER_OP(_backward_instance_norm) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", InstanceNormBackward); } // namespace op diff --git a/src/operator/leaky_relu.cu b/src/operator/leaky_relu.cu index d461949ed225..82ec59bfe907 100644 --- a/src/operator/leaky_relu.cu +++ b/src/operator/leaky_relu.cu @@ -28,9 +28,14 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(LeakyReLU).set_attr("FCompute", LeakyReLUCompute); +NNVM_REGISTER_OP(LeakyReLU) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { return false; }) + .set_attr("FCompute", LeakyReLUCompute); NNVM_REGISTER_OP(_backward_LeakyReLU) + .set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { return false; }) .set_attr("FCompute", LeakyReLUGradCompute); } // namespace op From e70402220237a5b82c74c0a0519a94d9b6cf3921 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 3 Mar 2022 22:13:33 -0800 Subject: [PATCH 24/30] Ops calling FStatefulCompute* are not CUDA Graphs compatible by default --- src/imperative/cuda_graphs.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index f7c1663c4efa..d087acc6888f 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -510,16 +510,21 @@ class CudaGraphsExec { // Is the Op OK to make part of a CUDA Graph? 
bool OpOK(const std::shared_ptr& exec) { - static auto& fstateful = Op::GetAttr("FCreateOpState"); - static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); - static auto& fcompute_ex = Op::GetAttr("FComputeEx"); - const auto& attrs = exec->attrs; + static auto& fgraphcompatible = + Op::GetAttr("FIsCUDAGraphsCompatible"); + static auto& fstateful = Op::GetAttr("FCreateOpState"); + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); + static auto& fstatefulcompute = Op::GetAttr("FStatefulCompute"); + static auto& fstatefulcompute_ex = Op::GetAttr("FStatefulComputeEx"); + const auto& attrs = exec->attrs; if (attrs.op != nullptr) { const auto f = fgraphcompatible.get(attrs.op, nullptr); if (f != nullptr) { return f(attrs, exec->op_ctx.is_train); } - if (fstateful.get(attrs.op, nullptr) != nullptr) { + if (fstateful.get(attrs.op, nullptr) != nullptr || + fstatefulcompute.get(attrs.op, nullptr) != nullptr || + fstatefulcompute_ex.get(attrs.op, nullptr) != nullptr) { if (verbose_) { LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; } From da59cff1df9095809940560d66bf4c33593bdfc8 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 3 Mar 2022 23:34:56 -0800 Subject: [PATCH 25/30] Fix clang-tidy --- src/imperative/cuda_graphs.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index d087acc6888f..ee2ba00aea84 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -510,11 +510,10 @@ class CudaGraphsExec { // Is the Op OK to make part of a CUDA Graph? 
bool OpOK(const std::shared_ptr& exec) { - static auto& fgraphcompatible = - Op::GetAttr("FIsCUDAGraphsCompatible"); - static auto& fstateful = Op::GetAttr("FCreateOpState"); - static auto& fcompute_ex = Op::GetAttr("FComputeEx"); - static auto& fstatefulcompute = Op::GetAttr("FStatefulCompute"); + static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); + static auto& fstateful = Op::GetAttr("FCreateOpState"); + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); + static auto& fstatefulcompute = Op::GetAttr("FStatefulCompute"); static auto& fstatefulcompute_ex = Op::GetAttr("FStatefulComputeEx"); const auto& attrs = exec->attrs; if (attrs.op != nullptr) { From 45bb7b8403bc05b6f40603e78ce0c38f181e25d9 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 9 Mar 2022 16:46:45 -0800 Subject: [PATCH 26/30] Revert "Add temp output to debug centos crash" This reverts commit e013a85ea599fa761cb98762f11feab6e7d74049. --- tests/python/gpu/test_gluon_gpu.py | 95 +++++++++++++++--------------- 1 file changed, 46 insertions(+), 49 deletions(-) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 46e937377bc6..20a7f26e8686 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -597,8 +597,7 @@ def test_cudnn_dropout_reproducibility(): assert_almost_equal(a.grad, b.grad) @mx.util.use_np -@pytest.mark.serial -def test_cuda_graphs(capsys): +def test_cuda_graphs(): class GraphTester(gluon.HybridBlock): def __init__(self, function_to_test, **kwargs): super(GraphTester, self).__init__(**kwargs) @@ -654,56 +653,54 @@ def generate_inputs(self): N = 10 - with capsys.disabled(): - with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', - 'MXNET_CUDA_GRAPHS_VERBOSE': '1', - 'MXNET_USE_FUSION': '0'}): - device = mx.gpu(0) - for test_desc in tested_ops: - sys.stdout.write('Testing {}\n'.format( test_desc.name)) - inputs = test_desc.generate_inputs() - inputsg = [i.copy() for i in inputs] - for i in 
inputsg: - i.attach_grad() - seed = random.randint(0, 10000) - net = GraphTester(test_desc.f) - netg = GraphTester(test_desc.f) - - # initialize parameters - net.initialize(device=device) - netg.initialize(device=device) - - net(*inputs) + with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', + 'MXNET_USE_FUSION': '0'}): + device = mx.gpu(0) + for test_desc in tested_ops: + print("Testing ", test_desc.name) + inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - p2.set_data(p1.data()) + # initialize parameters + net.initialize(device=device) + netg.initialize(device=device) - netg.hybridize(static_alloc=True, static_shape=True) + net(*inputs) - print(" Testing inference mode") - with random_seed(seed): - for _ in range(N): - assert_almost_equal(net(*inputs), netg(*inputsg)) + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + p2.set_data(p1.data()) - mx.npx.waitall() - print(" Testing training mode") + netg.hybridize(static_alloc=True, static_shape=True) + + print("Testing inference mode") + with random_seed(seed): for _ in range(N): - with random_seed(seed): - with mx.autograd.record(): - out = net(*inputs) - out.backward() - - with random_seed(seed): - with mx.autograd.record(): - outg = netg(*inputsg) - outg.backward() - - assert_almost_equal(out, outg) - for i, ig in zip(inputs, inputsg): - assert_almost_equal(i.grad, ig.grad) - - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - assert_almost_equal(p1.data(), p2.data()) - if p1.grad_req != 'null': - assert_almost_equal(p1.grad(), p2.grad()) + assert_almost_equal(net(*inputs), netg(*inputsg)) + + mx.npx.waitall() + print("Testing training mode") + for _ in range(N): + with random_seed(seed): + with 
mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) mx.npx.waitall() From b7ecce28254fa74692fb4269752d86886d0b33c5 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 9 Mar 2022 16:58:20 -0800 Subject: [PATCH 27/30] Quiet 'unused variable' compilation warning --- src/imperative/cuda_graphs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index ee2ba00aea84..06fe50ef4a5f 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -304,7 +304,7 @@ class CudaGraphsSubSegExec { << ".dot"; CUDA_CALL(cudaGraphDebugDotPrint(graph_.get(), filename.str().c_str(), dotfile_flags)); #else - static bool dot_file_unsupported = []() { + [[maybe_unused]] static bool dot_file_unsupported = []() { // NOLINT LOG(INFO) << "MXNET_CUDA_GRAPHS_DBG_FILE setting ignored- requires CUDA version >= 11.3"; return true; }(); From c609ccea31036ac1f4fe4c830232f412c97e8eff Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 9 Mar 2022 21:02:03 -0800 Subject: [PATCH 28/30] Trigger CI From eaf61a02d3340a78cff36ba308ff5c6934ba8435 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 15 Mar 2022 13:45:48 -0700 Subject: [PATCH 29/30] Check of FCreateOpState removed given new check for FStatefulCompute* --- src/imperative/cuda_graphs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/imperative/cuda_graphs.h b/src/imperative/cuda_graphs.h index 06fe50ef4a5f..c9e16d84e8b3 100644 --- a/src/imperative/cuda_graphs.h +++ b/src/imperative/cuda_graphs.h @@ -511,7 +511,6 @@ 
class CudaGraphsExec { // Is the Op OK to make part of a CUDA Graph? bool OpOK(const std::shared_ptr& exec) { static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); - static auto& fstateful = Op::GetAttr("FCreateOpState"); static auto& fcompute_ex = Op::GetAttr("FComputeEx"); static auto& fstatefulcompute = Op::GetAttr("FStatefulCompute"); static auto& fstatefulcompute_ex = Op::GetAttr("FStatefulComputeEx"); @@ -521,8 +520,7 @@ class CudaGraphsExec { if (f != nullptr) { return f(attrs, exec->op_ctx.is_train); } - if (fstateful.get(attrs.op, nullptr) != nullptr || - fstatefulcompute.get(attrs.op, nullptr) != nullptr || + if (fstatefulcompute.get(attrs.op, nullptr) != nullptr || fstatefulcompute_ex.get(attrs.op, nullptr) != nullptr) { if (verbose_) { LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; From f4510273fed48bc7b94cbcb0cc9c9dfe4aec96d3 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 15 Mar 2022 16:55:10 -0700 Subject: [PATCH 30/30] Revert "Temporarily add '-s' to pytest serial tests" This reverts commit 5a2f847558a7f55790f1ad1fb5ee930b4ad1a3a9. 
--- ci/docker/runtime_functions.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 69453c39e538..05f80032cd15 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -801,8 +801,7 @@ cd_unittest_ubuntu() { local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"} OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest - # Temporarily tell pytest to not capture output ('-s') to get more insight into Python: Aborted error - pytest -m 'serial' --durations=50 --verbose -s --log-cli-level=DEBUG tests/python/unittest + pytest -m 'serial' --durations=50 --verbose tests/python/unittest # https://github.com/apache/incubator-mxnet/issues/11801 # if [[ ${mxnet_variant} = "cpu" ]] || [[ ${mxnet_variant} = "mkl" ]]; then