diff --git a/doc/env_var.md b/doc/env_var.md new file mode 100644 index 000000000000..ccaf7f2b61d8 --- /dev/null +++ b/doc/env_var.md @@ -0,0 +1,22 @@ +Environment Variables +===================== +MXNet has several settings that can be changed via environment variables. +Usually you do not need to change these settings, but they are listed here for reference. + +* MXNET_GPU_WORKER_NTHREADS (default=1 when number of cpu<8, otherwise 2) + - Maximum number of threads that do the computation job on each GPU. +* MXNET_GPU_COPY_NTHREADS (default=1) + - Maximum number of threads that do the memory copy job on each GPU. +* MXNET_CPU_WORKER_NTHREADS (default=1) + - Maximum number of threads that do the CPU computation job. +* MXNET_EXEC_ENABLE_INPLACE (default=true) + - Whether to enable inplace optimization in symbolic execution. +* MXNET_EXEC_MATCH_RANGE (default=10) + - The rough matching scale in the symbolic execution memory allocator. + - Set this to 0 if we do not want to enable memory sharing between graph nodes (for debugging purposes). +* MXNET_ENGINE_TYPE (default=ThreadedEnginePerDevice) + - The type of underlying execution engine of MXNet. + - List of choices + - NaiveEngine: a very simple engine that uses the master thread to do computation. + - ThreadedEngine: a threaded engine that uses a global thread pool to schedule jobs. + - ThreadedEnginePerDevice: a threaded engine that allocates threads per GPU. 
diff --git a/doc/index.md b/doc/index.md index 644289e2f095..12f1be6b8591 100644 --- a/doc/index.md +++ b/doc/index.md @@ -16,10 +16,10 @@ User Guide Developer Guide --------------- * [Developer Documents](developer-guide/index.md) +* [Environment Variables for MXNet](env_var.md) * [Contributor Guideline](contribute.md) * [Doxygen Version of C++ API](https://mxnet.readthedocs.org/en/latest/doxygen) - Indices and tables ------------------ diff --git a/src/common/utils.h b/src/common/utils.h index ffa5c349c65c..bb1315b139db 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -11,13 +11,26 @@ #include #include #include +#include #endif // DMLC_USE_CXX11 +#include + namespace mxnet { namespace common { #if DMLC_USE_CXX11 +// heuristic to determine the number of threads per GPU +inline int GetNumThreadPerGPU() { + int nthread = std::thread::hardware_concurrency(); + if (nthread < 8) { + return dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 1); + } else { + return dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 2); + } +} + /*! * \brief Random Engine */ diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc index c2848e36d831..ad4612718abf 100644 --- a/src/engine/threaded_engine_perdevice.cc +++ b/src/engine/threaded_engine_perdevice.cc @@ -4,12 +4,14 @@ * \brief ThreadedEngine that uses fix amount of thread for each device. 
*/ #include +#include #include #include #include #include "./threaded_engine.h" #include "./thread_pool.h" #include "../common/lazy_alloc_array.h" +#include "../common/utils.h" namespace mxnet { namespace engine { @@ -24,8 +26,8 @@ namespace engine { class ThreadedEnginePerDevice : public ThreadedEngine { public: ThreadedEnginePerDevice() noexcept(false) { - cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 2); - gpu_worker_nthreads_ = dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 2); + cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 1); + gpu_worker_nthreads_ = common::GetNumThreadPerGPU(); gpu_copy_nthreads_ = dmlc::GetEnv("MXNET_GPU_COPY_NTHREADS", 1); // create CPU task cpu_worker_.reset(new ThreadWorkerBlock()); diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc index 7293defa04c6..42eea45e86e7 100644 --- a/src/io/iter_image_recordio.cc +++ b/src/io/iter_image_recordio.cc @@ -175,7 +175,7 @@ inline void ImageRecordIOParser::Init( #pragma omp parallel { // be conservative, set number of real cores - maxthread = std::max(omp_get_num_procs() / 2, 1); + maxthread = std::max(omp_get_num_procs() / 2 - 1, 1); } param_.preprocess_threads = std::min(maxthread, param_.preprocess_threads); #pragma omp parallel num_threads(param_.preprocess_threads) diff --git a/src/symbol/graph_memory_allocator.h b/src/symbol/graph_memory_allocator.h index 759c79ad5452..c969fc1aaa86 100644 --- a/src/symbol/graph_memory_allocator.h +++ b/src/symbol/graph_memory_allocator.h @@ -13,6 +13,7 @@ #include #include "./static_graph.h" #include "./graph_algorithm.h" +#include "../common/utils.h" namespace mxnet { /*! @@ -119,6 +120,8 @@ GraphStorageAllocator::GraphStorageAllocator( // color based match will cost a bit more memory usually // but also enables more parallelization. 
num_match_color_ = dmlc::GetEnv("MXNET_EXEC_MATCH_NUM_COLOR", 4); + num_match_color_ = std::min(static_cast(common::GetNumThreadPerGPU()), + num_match_color_); this->InitColor(topo_order); }