apache · tqchen · Sep 23, 2015 · Sep 23, 2015
diff --git a/doc/env_var.md b/doc/env_var.md
@@ -0,0 +1,22 @@
+Environment Variables
+=====================
+MXNet has several settings that can be changed via environment variables.
+Usually you do not need to change these settings, but they are listed here for reference.
+
+* MXNET_GPU_WORKER_NTHREADS (default=1 when the number of CPU hardware threads is less than 8, otherwise 2)
+  - Maximum number of threads that do the computation job on each GPU.
+* MXNET_GPU_COPY_NTHREADS (default=1)
+  - Maximum number of threads that do the memory copy job on each GPU.
+* MXNET_CPU_WORKER_NTHREADS (default=1)
+  - Maximum number of threads that do the CPU computation job.
+* MXNET_EXEC_ENABLE_INPLACE (default=true)
+  - Whether to enable inplace optimization in symbolic execution.
+* MXNET_EXEC_MATCH_RANGE (default=10)
+  - The rough matching scale in symbolic execution memory allocator.
+  - Set this to 0 to disable memory sharing between graph nodes (for debugging purposes).
+* MXNET_ENGINE_TYPE (default=ThreadedEnginePerDevice)
+  - The type of underlying execution engine of MXNet.
+  - List of choices
+    - NaiveEngine: a very simple engine that uses the master thread to do computation.
+    - ThreadedEngine: a threaded engine that uses global thread pool to schedule jobs.
+    - ThreadedEnginePerDevice: a threaded engine that allocates threads per GPU.
diff --git a/doc/index.md b/doc/index.md
@@ -16,10 +16,10 @@ User Guide
 Developer Guide
 ---------------
 * [Developer Documents](developer-guide/index.md)
+* [Environment Variables for MXNet](env_var.md)
 * [Contributor Guideline](contribute.md)
 * [Doxygen Version of C++ API](https://mxnet.readthedocs.org/en/latest/doxygen)
 
-
 Indices and tables
 ------------------
 

diff --git a/src/common/utils.h b/src/common/utils.h
@@ -11,13 +11,26 @@
 #include <type_traits>
 #include <utility>
 #include <random>
+#include <thread>
 #endif  // DMLC_USE_CXX11
 
+#include <dmlc/logging.h>
+
 namespace mxnet {
 namespace common {
 
 #if DMLC_USE_CXX11
 
+// Heuristic choosing how many worker threads serve each GPU: the default is
+// 1 when fewer than 8 hardware threads are reported and 2 otherwise; the
+// MXNET_GPU_WORKER_NTHREADS environment variable overrides that default.
+inline int GetNumThreadPerGPU() {
+  const int hw_threads = static_cast<int>(std::thread::hardware_concurrency());
+  // pick the fallback first so the environment is queried in a single place
+  const int fallback = (hw_threads < 8) ? 1 : 2;
+  return dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", fallback);
+}
+
 /*!
  * \brief Random Engine
  */

diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
@@ -4,12 +4,14 @@
  * \brief ThreadedEngine that uses fix amount of thread for each device.
  */
 #include <dmlc/base.h>
+#include <dmlc/omp.h>
 #include <dmlc/logging.h>
 #include <dmlc/parameter.h>
 #include <dmlc/concurrency.h>
 #include "./threaded_engine.h"
 #include "./thread_pool.h"
 #include "../common/lazy_alloc_array.h"
+#include "../common/utils.h"
 
 namespace mxnet {
 namespace engine {
@@ -24,8 +26,8 @@ namespace engine {
 class ThreadedEnginePerDevice : public ThreadedEngine {
  public:
   ThreadedEnginePerDevice() noexcept(false) {
-    cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 2);
-    gpu_worker_nthreads_ = dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 2);
+    cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
+    gpu_worker_nthreads_ = common::GetNumThreadPerGPU();
     gpu_copy_nthreads_ = dmlc::GetEnv("MXNET_GPU_COPY_NTHREADS", 1);
     // create CPU task
     cpu_worker_.reset(new ThreadWorkerBlock());

diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
@@ -175,7 +175,7 @@ inline void ImageRecordIOParser::Init(
   #pragma omp parallel
   {
     // be conservative, set number of real cores
-    maxthread = std::max(omp_get_num_procs() / 2, 1);
+    maxthread = std::max(omp_get_num_procs() / 2 - 1, 1);
   }
   param_.preprocess_threads = std::min(maxthread, param_.preprocess_threads);
   #pragma omp parallel num_threads(param_.preprocess_threads)

diff --git a/src/symbol/graph_memory_allocator.h b/src/symbol/graph_memory_allocator.h
@@ -13,6 +13,7 @@
 #include <algorithm>
 #include "./static_graph.h"
 #include "./graph_algorithm.h"
+#include "../common/utils.h"
 
 namespace mxnet {
 /*!
@@ -119,6 +120,8 @@ GraphStorageAllocator::GraphStorageAllocator(
   // color based match will cost a bit more memory usually
   // but also enables more parallelization.
   num_match_color_ = dmlc::GetEnv("MXNET_EXEC_MATCH_NUM_COLOR", 4);
+  num_match_color_ = std::min(static_cast<uint32_t>(common::GetNumThreadPerGPU()),
+                              num_match_color_);
   this->InitColor(topo_order);
 }