diff --git a/Makefile b/Makefile
index d758c443241e..74c71a9cbc07 100644
--- a/Makefile
+++ b/Makefile
@@ -13,10 +13,19 @@ ifndef DMLC_CORE
 endif
 
 
+# Set NO_OPENMP for the makefiles included below unless USE_OPENMP_ITER=1;
+# it is unexported again right after those includes.
+# NOTE(review): evaluated before $(config) is included, so USE_OPENMP_ITER
+# presumably must come from the command line or environment - confirm.
+ifneq ($(USE_OPENMP_ITER), 1)
+  export NO_OPENMP = 1
+endif
+
 # use customized config file
 include $(config)
 include mshadow/make/mshadow.mk
 include $(DMLC_CORE)/make/dmlc.mk
+unexport NO_OPENMP
 
 # all tge possible warning tread
 WARNFLAGS= -Wall
@@ -39,10 +48,21 @@ endif
 
 # setup opencv
 ifeq ($(USE_OPENCV),1)
-	CFLAGS+= -DCXXNET_USE_OPENCV=1
+	CFLAGS+= -DMXNET_USE_OPENCV=1
 	LDFLAGS+= `pkg-config --libs opencv`
 else
-	CFLAGS+= -DCXXNET_USE_OPENCV=0
+	CFLAGS+= -DMXNET_USE_OPENCV=0
+endif
+
+# setup opencv decoder
+ifeq ($(USE_OPENCV_DECODER),1)
+	CFLAGS+= -DMXNET_USE_OPENCV_DECODER=1
+else
+	CFLAGS+= -DMXNET_USE_OPENCV_DECODER=0
+endif
+
+ifeq ($(USE_OPENMP_ITER), 1)
+	CFLAGS += -fopenmp
 endif
 
 ifeq ($(USE_CUDNN), 1)
@@ -62,7 +82,7 @@ endif
 ENGINE=naive_engine.o
 BIN = tests/test_simple_engine
 OBJ = narray_function_cpu.o
-OBJCXX11 = narray.o c_api.o operator.o symbol.o storage.o static_graph.o graph_executor.o io.o iter_mnist.o $(ENGINE)
+OBJCXX11 = narray.o c_api.o operator.o symbol.o storage.o static_graph.o graph_executor.o io.o iter_mnist.o iter_image_recordio.o $(ENGINE)
 CUOBJ = narray_function_gpu.o
 SLIB = lib/libmxnet.so
 ALIB = lib/libmxnet.a
@@ -92,6 +112,7 @@ operator.o: src/operator/operator.cc
 c_api.o: src/c_api.cc
 io.o: src/io/io.cc
 iter_mnist.o: src/io/iter_mnist.cc src/io/*.h
+iter_image_recordio.o: src/io/iter_image_recordio.cc src/io/*.h
 
 # Rules for operators
 OPERATOR_HDR=$(wildcard src/operator/*-inl.h)
diff --git a/dmlc-core b/dmlc-core
index 75f1950d386d..7d3c78428819 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 75f1950d386d033b0b64919017515d27e698962a
+Subproject commit 7d3c78428819dc84c4da8ae1f302ba6c6a235a5d
diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index 20694b7064da..14d9bd1b8971 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -144,8 +144,109 @@ def RandomInit(narray):
 flatten = mx.symbol.Flatten(data=pool, name="flatten1")
 fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10, name="fc1")
 loss = mx.symbol.Softmax(data=fc, name="softmax")
+args_list = loss.list_arguments()
 
 data_shape = (128, 3, 28, 28)
 arg_shapes, out_shapes, aux_shapes = loss.infer_shape(data=data_shape)
 
+arg_narrays = [mx.narray.create(shape, ctx=mx.Context("gpu")) for shape in arg_shapes]
+grad_narrays = [mx.narray.create(shape, ctx=mx.Context("gpu")) for shape in arg_shapes]
 
+inputs = dict(zip(args_list, arg_narrays))
+
+name2shape = dict(zip(args_list, arg_shapes))
+pred = mx.narray.create(out_shapes[0])
+
+np.random.seed(0)
+# set random weight
+for name, narray in inputs.items():
+    if "weight" in name:
+        tmp = mx.narray.create(name2shape[name])
+        tmp.numpy[:] = np.random.uniform(-0.07, 0.07, name2shape[name])
+        tmp.copyto(narray)
+    if "bias" in name:
+        narray[:] = 0.0
+
+# bind executer
+# TODO(bing): think of a better bind interface
+executor = loss.bind(mx.Context('gpu'), arg_narrays, grad_narrays)
+# update
+
+out_narray = executor.heads()[0]
+grad_narray = mx.narray.create(out_narray.shape)
+
+epoch = 9
+lr = 0.1
+wd = 0.0004
+
+def Update(grad, weight):
+    weight[:] -= lr * grad  / batch_size
+
+block = list(zip(grad_narrays, arg_narrays))
+
+#check data
+get_data.GetCifar10()
+train_dataiter = mx.io.ImageRecordIter(
+        path_imgrec="data/cifar/train.rec",
+        mean_img="data/cifar/cifar_mean.bin",
+        rand_crop=True,
+        rand_mirror=True,
+        input_shape=(3,28,28),
+        batch_size=128,
+        nthread=1)
+test_dataiter = mx.io.ImageRecordIter(
+        path_imgrec="data/cifar/test.rec",
+        mean_img="data/cifar/cifar_mean.bin",
+        rand_crop=True,
+        rand_mirror=True,
+        input_shape=(3,28,28),
+        batch_size=100,
+        nthread=1)
+
+tmp_label = mx.narray.create(name2shape["sm_label"])
+
+def test_cifar():
+    """Train for `epoch` passes, print accuracies, assert the final ones."""
+    acc_train = 0.
+    acc_val = 0.
+    for i in range(epoch):
+        print("Epoch %d" % i)
+        train_acc = 0.0
+        val_acc = 0.0
+        train_nbatch = 0
+        val_nbatch = 0
+        # train: forward, score the batch, then backward and update weights
+        for data, label in train_dataiter:
+            tmp_label.numpy[:] = label.numpy.reshape(tmp_label.shape)
+            data.copyto(inputs["data"])
+            tmp_label.copyto(inputs["sm_label"])
+            executor.forward()
+            out_narray.copyto(pred)
+            train_acc += CalAcc(pred.numpy, label.numpy.flatten())
+            train_nbatch += 1
+            out_narray.copyto(grad_narray)
+            executor.backward([grad_narray])
+
+            for grad, weight in block:
+                Update(grad, weight)
+
+        # evaluate: forward only, using the test.rec iterator created above
+        for data, label in test_dataiter:
+            # labels are not fed to the executor here, only used for scoring
+            label = label.numpy.flatten()
+            data.copyto(inputs["data"])
+            executor.forward()
+            out_narray.copyto(pred)
+            val_acc += CalAcc(pred.numpy, label)
+            val_nbatch += 1
+        acc_train = train_acc / train_nbatch
+        acc_val = val_acc / val_nbatch
+        print("Train Acc: ", train_acc / train_nbatch)
+        print("Valid Acc: ", val_acc / val_nbatch)
+        train_dataiter.reset()
+        test_dataiter.reset()
+    assert(acc_train > 0.98)
+    assert(acc_val > 0.97)
+
+if __name__ == "__main__":
+    test_cifar()
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 47a59eec54fe..7bb86f4eece3 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -109,5 +109,21 @@ struct DataIteratorReg
   }                                                                     \
   DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
   .set_body(__create__ ## DataIteratorType ## __)
+/*!
+ * \brief Register iterator `name` whose factory returns a new
+ *  HoldingDataIterType constructed around a new ChainedDataIterType.
+ *  The factory function is named after ChainedDataIterType.
+ * \code
+ * // example of registering a imagerec iterator
+ * MXNET_REGISTER_IO_CHAINED_ITER(ImageRec, ImageRecordIter, BatchAdaptIter)
+ * .describe("batched image record data iterator");
+ * \endcode
+ */
+#define MXNET_REGISTER_IO_CHAINED_ITER(name, ChainedDataIterType, HoldingDataIterType)          \
+  static ::mxnet::IIterator<DataBatch>* __create__ ## ChainedDataIterType ## __() { \
+    return new HoldingDataIterType(new ChainedDataIterType);                                    \
+  }                                                                     \
+  DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
+  .set_body(__create__ ## ChainedDataIterType ## __)
 }  // namespace mxnet
 #endif  // MXNET_IO_H_
diff --git a/make/config.mk b/make/config.mk
index cd04b146180c..3e93e240e493 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -27,8 +27,8 @@ USE_CUDA_PATH = NONE
 # whether use opencv during compilation
 # you can disable it, however, you will not able to use
 # imbin iterator
-USE_OPENCV = 0
-USE_OPENCV_DECODER = 0
+USE_OPENCV = 1
+USE_OPENCV_DECODER = 1
 # whether use CUDNN R3 library
 USE_CUDNN = 0
 # add the path to CUDNN libary to link and compile flag
diff --git a/src/common/utils.h b/src/common/utils.h
index cf1fd2f1bb36..29cb9f0e2f2a 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -21,6 +21,11 @@ namespace common {
  * \brief Random Engine
  */
 typedef std::mt19937 RANDOM_ENGINE;
+// Get a float in [0, 1), prnd is the pointer to a Random Engine
+#define NextDouble(prnd) std::generate_canonical<float, 10>(*(prnd))
+// Get a random int in [0, range); (range) is parenthesized for args like `n + 1`
+#define NextUInt32(range, prnd) static_cast<uint32_t> \
+(floor(std::generate_canonical<float, 10>(*(prnd)) * (range)))
 
 /*!
  * \brief Helper functions.
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
new file mode 100644
index 000000000000..a4b77f5a41df
--- /dev/null
+++ b/src/io/image_augmenter.h
@@ -0,0 +1,410 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file image_augmenter.h
+ * \brief OpenCV-level and tensor-level image augmentation helper
+ * \author Naiyan Wang, Tianqi Chen, Tianjun Xiao
+ */
+#ifndef MXNET_IO_IMAGE_AUGMENTER_H_
+#define MXNET_IO_IMAGE_AUGMENTER_H_
+
+#include <opencv2/opencv.hpp>
+#include <utility>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "../common/utils.h"
+
+namespace mxnet {
+namespace io {
+/*! \brief image augmentation parameters*/
+struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
+  /*! \brief whether we do random cropping */
+  bool rand_crop;
+  /*! \brief y offset of the nonrandom crop, -1 means not set */
+  int crop_y_start;
+  /*! \brief x offset of the nonrandom crop, -1 means not set */
+  int crop_x_start;
+  /*! \brief [-max_rotate_angle, max_rotate_angle] */
+  int max_rotate_angle;
+  /*! \brief max aspect ratio */
+  float max_aspect_ratio;
+  /*! \brief random shear the image [-max_shear_ratio, max_shear_ratio] */
+  float max_shear_ratio;
+  /*! \brief max crop size */
+  int max_crop_size;
+  /*! \brief min crop size */
+  int min_crop_size;
+  /*! \brief max scale ratio */
+  float max_random_scale;
+  /*! \brief min scale_ratio */
+  float min_random_scale;
+  /*! \brief min image size */
+  float min_img_size;
+  /*! \brief max image size */
+  float max_img_size;
+  /*! \brief rotate angle */
+  int rotate;
+  /*! \brief filled color while padding */
+  int fill_value;
+  // The following are params for tensor process
+  /*! \brief whether to mirror the image */
+  bool mirror;
+  /*! \brief whether to perform rand mirror the image */
+  bool rand_mirror;
+  /*! \brief mean file string*/
+  std::string mean_img;
+  /*! \brief mean value for r channel */
+  float mean_r;
+  /*! \brief mean value for g channel */
+  float mean_g;
+  /*! \brief mean value for b channel */
+  float mean_b;
+  /*! \brief shape of the image data*/
+  TShape input_shape;
+  /*! \brief scale on color space */
+  float scale;
+  /*! \brief maximum ratio of contrast variation */
+  float max_random_contrast;
+  /*! \brief maximum value of illumination variation */
+  float max_random_illumination;
+  /*! \brief whether to suppress informational printing */
+  bool silent;
+  // declare parameters; every member above must be declared here to get a default
+  DMLC_DECLARE_PARAMETER(ImageAugmentParam) {
+    DMLC_DECLARE_FIELD(rand_crop).set_default(true)
+        .describe("Whether we de random cropping");
+    DMLC_DECLARE_FIELD(crop_y_start).set_default(-1)
+        .describe("Where to nonrandom crop on y");
+    DMLC_DECLARE_FIELD(crop_x_start).set_default(-1)
+        .describe("Where to nonrandom crop on x");
+    DMLC_DECLARE_FIELD(max_rotate_angle).set_default(0)
+        .describe("Rotate can be [-max_rotate_angle, max_rotate_angle]");
+    DMLC_DECLARE_FIELD(max_aspect_ratio).set_default(0.0f)
+        .describe("Max aspect ratio");
+    DMLC_DECLARE_FIELD(max_shear_ratio).set_default(0.0f)
+        .describe("Shear rotate can be made between [-max_shear_ratio_, max_shear_ratio_]");
+    DMLC_DECLARE_FIELD(max_crop_size).set_default(-1)
+        .describe("Maximum crop size");
+    DMLC_DECLARE_FIELD(min_crop_size).set_default(-1)
+        .describe("Minimum crop size");
+    DMLC_DECLARE_FIELD(max_random_scale).set_default(1.0f)
+        .describe("Maxmum scale ratio");
+    DMLC_DECLARE_FIELD(min_random_scale).set_default(1.0f)
+        .describe("Minimum scale ratio");
+    DMLC_DECLARE_FIELD(max_img_size).set_default(1e10f)
+        .describe("Maxmum image size");
+    DMLC_DECLARE_FIELD(min_img_size).set_default(0.0f)
+        .describe("Minimum image size");
+    DMLC_DECLARE_FIELD(rotate).set_default(-1)
+        .describe("Rotate angle");
+    DMLC_DECLARE_FIELD(fill_value).set_default(255)
+        .describe("Filled value while padding");
+    DMLC_DECLARE_FIELD(mirror).set_default(false)
+        .describe("Whether to mirror the image");
+    DMLC_DECLARE_FIELD(rand_mirror).set_default(false)
+        .describe("Whether to mirror the image randomly");
+    DMLC_DECLARE_FIELD(mean_img).set_default("")
+        .describe("Mean Image to be subtracted");
+    DMLC_DECLARE_FIELD(mean_r).set_default(0.0f)
+        .describe("Mean value on R channel");
+    DMLC_DECLARE_FIELD(mean_g).set_default(0.0f)
+        .describe("Mean value on G channel");
+    DMLC_DECLARE_FIELD(mean_b).set_default(0.0f)
+        .describe("Mean value on B channel");
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero().describe("Input shape of the neural net");
+    DMLC_DECLARE_FIELD(scale).set_default(1.0f)
+        .describe("Scale in color space");
+    DMLC_DECLARE_FIELD(max_random_contrast).set_default(0.0f)
+        .describe("Maximum ratio of contrast variation");
+    DMLC_DECLARE_FIELD(max_random_illumination).set_default(0.0f)
+        .describe("Maximum value of illumination variation");
+    DMLC_DECLARE_FIELD(silent).set_default(false).describe("Whether to suppress augment info");
+  }
+};
+
+/*! \brief helper class to do image augmentation */
+class ImageAugmenter {
+ public:
+  // constructor; meanfile_ready_ starts false so it is never read uninitialized
+  ImageAugmenter(void)
+      : tmpres_(false),
+        rotateM_(2, 3, CV_32F), meanfile_ready_(false) {
+  }
+  virtual ~ImageAugmenter() {
+  }
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    kwargs_left = param_.InitAllowUnknown(kwargs);
+    for (size_t i = 0; i < kwargs_left.size(); i++) {
+      if (kwargs_left[i].first != "rotate_list") continue;
+      // "rotate_list" is a comma separated list of integer angles
+      const char *val = kwargs_left[i].second.c_str();
+      const char *end = val + strlen(val);
+      char buf[128];
+      while (val < end) {
+        if (sscanf(val, "%127[^,]", buf) != 1) { ++val; continue; }  // skip empty token
+        val += strlen(buf) + 1;
+        rotate_list_.push_back(atoi(buf));
+      }
+    }
+    // try to load the mean image now; the flag stays false when it is unavailable
+    meanfile_ready_ = false;
+    if (param_.mean_img.length() != 0) {
+      dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
+      if (fi != NULL) {
+        if (param_.silent == 0) {
+          printf("loading mean image from %s\n", param_.mean_img.c_str());
+        }
+        meanimg_.LoadBinary(*fi);
+        delete fi;
+        meanfile_ready_ = true;
+      }
+    }
+  }
+  /*!
+   * \brief apply a random affine transform (shear, rotation, scale, aspect
+   *   ratio) to src with cv::warpAffine, then crop (and resize, when crop
+   *   sizes are set) the result to input_shape
+   *   this function is not thread safe, it reuses the member buffer temp_
+   * \param src the source image
+   * \param prnd source of random number
+   */
+  virtual cv::Mat OpencvProcess(const cv::Mat &src,
+                          common::RANDOM_ENGINE *prnd) {
+    // shear
+    float s = NextDouble(prnd) * param_.max_shear_ratio * 2 - param_.max_shear_ratio;
+    // rotate
+    int angle = NextUInt32(param_.max_rotate_angle * 2, prnd) - param_.max_rotate_angle;
+    if (param_.rotate > 0) angle = param_.rotate;
+    if (rotate_list_.size() > 0) {
+      angle = rotate_list_[NextUInt32(rotate_list_.size(), prnd)];
+    }
+    float a = cos(angle / 180.0 * M_PI);
+    float b = sin(angle / 180.0 * M_PI);
+    // scale
+    float scale = NextDouble(prnd) * \
+        (param_.max_random_scale - param_.min_random_scale) + param_.min_random_scale;
+    // aspect ratio
+    float ratio = NextDouble(prnd) * \
+        param_.max_aspect_ratio * 2 - param_.max_aspect_ratio + 1;
+    float hs = 2 * scale / (1 + ratio);
+    float ws = ratio * hs;
+    // new width and height
+    float new_width = std::max(param_.min_img_size, \
+            std::min(param_.max_img_size, scale * src.cols));
+    float new_height = std::max(param_.min_img_size, \
+            std::min(param_.max_img_size, scale * src.rows));
+    cv::Mat M(2, 3, CV_32F);
+    M.at<float>(0, 0) = hs * a - s * b * ws;
+    M.at<float>(1, 0) = -b * ws;
+    M.at<float>(0, 1) = hs * b + s * a * ws;
+    M.at<float>(1, 1) = a * ws;
+    float ori_center_width = M.at<float>(0, 0) * src.cols + M.at<float>(0, 1) * src.rows;
+    float ori_center_height = M.at<float>(1, 0) * src.cols + M.at<float>(1, 1) * src.rows;
+    M.at<float>(0, 2) = (new_width - ori_center_width) / 2;
+    M.at<float>(1, 2) = (new_height - ori_center_height) / 2;
+    cv::warpAffine(src, temp_, M, cv::Size(new_width, new_height),
+                     cv::INTER_LINEAR,
+                     cv::BORDER_CONSTANT,
+                     cv::Scalar(param_.fill_value, param_.fill_value, param_.fill_value));
+    cv::Mat res = temp_;
+    // crop
+    if (param_.max_crop_size != -1 || param_.min_crop_size != -1) {
+      CHECK(res.cols >= param_.max_crop_size && res.rows >= \
+              param_.max_crop_size && param_.max_crop_size >= param_.min_crop_size)
+          << "input image size smaller than max_crop_size";
+      mshadow::index_t rand_crop_size = NextUInt32((param_.max_crop_size \
+              - param_.min_crop_size + 1), prnd) + param_.min_crop_size;
+      mshadow::index_t y = res.rows - rand_crop_size;
+      mshadow::index_t x = res.cols - rand_crop_size;
+      if (param_.rand_crop != 0) {
+        y = NextUInt32((y + 1), prnd);  // parenthesized: macro arg is used in a product
+        x = NextUInt32((x + 1), prnd);
+      } else {
+        y /= 2; x /= 2;
+      }
+      cv::Rect roi(x, y, rand_crop_size, rand_crop_size);
+      cv::resize(res(roi), res, cv::Size(param_.input_shape[1], param_.input_shape[2]));
+    } else {
+        CHECK(static_cast<mshadow::index_t>(res.cols) >= param_.input_shape[1] \
+                && static_cast<mshadow::index_t>(res.rows) >= param_.input_shape[2])
+            << "input image size smaller than input shape";
+        mshadow::index_t y = res.rows - param_.input_shape[2];
+        mshadow::index_t x = res.cols - param_.input_shape[1];
+        if (param_.rand_crop != 0) {
+            y = NextUInt32((y + 1), prnd);
+            x = NextUInt32((x + 1), prnd);
+        } else {
+            y /= 2; x /= 2;
+        }
+        cv::Rect roi(x, y, param_.input_shape[1], param_.input_shape[2]);
+        res = res(roi);
+    }
+    return res;
+  }
+  /*!
+   * \brief tensor front-end of OpencvProcess: copies data into an 8-bit
+   *   3-channel cv::Mat with the channel order reversed, augments it and
+   *   copies the result back (order reversed again) into tmpres_
+   *   returns data unchanged when NeedOpencvProcess() is false
+   * \param data the source image, indexed as data[channel][row][col]
+   * \param prnd source of random number
+   */
+  virtual mshadow::Tensor<cpu, 3> OpencvProcess(mshadow::Tensor<cpu, 3> data,
+                                          common::RANDOM_ENGINE *prnd) {
+    if (!NeedOpencvProcess()) return data;
+    cv::Mat res(data.size(1), data.size(2), CV_8UC3);
+    for (index_t i = 0; i < data.size(1); ++i) {
+      for (index_t j = 0; j < data.size(2); ++j) {
+        res.at<cv::Vec3b>(i, j)[0] = data[2][i][j];
+        res.at<cv::Vec3b>(i, j)[1] = data[1][i][j];
+        res.at<cv::Vec3b>(i, j)[2] = data[0][i][j];
+      }
+    }
+    res = this->OpencvProcess(res, prnd);
+    tmpres_.Resize(mshadow::Shape3(3, res.rows, res.cols));
+    for (index_t i = 0; i < tmpres_.size(1); ++i) {
+      for (index_t j = 0; j < tmpres_.size(2); ++j) {
+        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+        tmpres_[0][i][j] = bgr[2];
+        tmpres_[1][i][j] = bgr[1];
+        tmpres_[2][i][j] = bgr[0];
+      }
+    }
+    return tmpres_;
+  }
+
+  void TensorProcess(mshadow::TensorContainer<cpu, 3> *p_data,
+                       common::RANDOM_ENGINE *prnd) {
+    // retry loading the mean image in case the file was created after Init
+    if (meanfile_ready_ == false && param_.mean_img.length() != 0) {
+      dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
+      if (fi != NULL) {
+        if (param_.silent == 0) {
+          printf("loading mean image from %s\n", param_.mean_img.c_str());
+        }
+        meanimg_.LoadBinary(*fi);
+        delete fi;
+        meanfile_ready_ = true;
+      }
+    }
+    img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2]));
+    if (param_.input_shape[1] == 1) {
+      img_ = (*p_data) * param_.scale;
+    } else {
+      CHECK(p_data->size(1) >= param_.input_shape[1] && p_data->size(2) >= param_.input_shape[2])
+          << "Data size must be bigger than the input size to net.";
+      mshadow::index_t yy = p_data->size(1) - param_.input_shape[1];
+      mshadow::index_t xx = p_data->size(2) - param_.input_shape[2];
+      if (param_.rand_crop != 0 && (yy != 0 || xx != 0)) {
+        yy = NextUInt32((yy + 1), prnd);
+        xx = NextUInt32((xx + 1), prnd);
+      } else {
+        yy /= 2; xx /= 2;
+      }
+      if (p_data->size(1) != param_.input_shape[1] && param_.crop_y_start != -1) {
+        yy = param_.crop_y_start;
+      }
+      if (p_data->size(2) != param_.input_shape[2] && param_.crop_x_start != -1) {
+        xx = param_.crop_x_start;
+      }
+      float contrast = NextDouble(prnd) * param_.max_random_contrast \
+                       * 2 - param_.max_random_contrast + 1;
+      float illumination = NextDouble(prnd) * param_.max_random_illumination \
+                           * 2 - param_.max_random_illumination;
+      if (param_.mean_r > 0.0f || param_.mean_g > 0.0f || param_.mean_b > 0.0f) {
+        // subtract per-channel mean; Process() stores red in channel 0, blue in channel 2
+        (*p_data)[0] -= param_.mean_r;
+        (*p_data)[1] -= param_.mean_g;
+        (*p_data)[2] -= param_.mean_b;
+        if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
+          img_ = mirror(crop((*p_data) * contrast + illumination, \
+                      img_[0].shape_, yy, xx)) * param_.scale;
+        } else {
+          img_ = crop((*p_data) * contrast + illumination, \
+                  img_[0].shape_, yy, xx) * param_.scale;
+        }
+      } else if (!meanfile_ready_ || param_.mean_img.length() == 0) {
+        // do not subtract anything
+        if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
+          img_ = mirror(crop((*p_data), img_[0].shape_, yy, xx)) * param_.scale;
+        } else {
+          img_ = crop((*p_data), img_[0].shape_, yy, xx) * param_.scale;
+        }
+      } else {
+        // subtract mean image
+        if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
+          if (p_data->shape_ == meanimg_.shape_) {
+            img_ = mirror(crop(((*p_data) - meanimg_) * contrast \
+                        + illumination, img_[0].shape_, yy, xx)) * param_.scale;
+          } else {
+            img_ = (mirror(crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) \
+                    * contrast + illumination) * param_.scale;
+          }
+        } else {
+          if (p_data->shape_ == meanimg_.shape_) {
+            img_ = crop(((*p_data) - meanimg_) * contrast + illumination, \
+                    img_[0].shape_, yy, xx) * param_.scale;
+          } else {
+            img_ = ((crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * \
+                    contrast + illumination) * param_.scale;
+          }
+        }
+      }
+    }
+    (*p_data) = img_;
+  }
+
+  // decode the encoded image in dptr[0, sz), augment it and store the result in p_data
+  virtual void Process(unsigned char *dptr, size_t sz,
+                       mshadow::TensorContainer<cpu, 3> *p_data, common::RANDOM_ENGINE *prnd) {
+    cv::Mat buf(1, sz, CV_8U, dptr);
+    cv::Mat res = cv::imdecode(buf, 1);
+    CHECK(!res.empty()) << "failed to decode image record";
+    if (NeedOpencvProcess()) res = this->OpencvProcess(res, prnd);
+    p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
+    for (index_t i = 0; i < p_data->size(1); ++i) {
+      for (index_t j = 0; j < p_data->size(2); ++j) {
+        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+        (*p_data)[0][i][j] = bgr[2];
+        (*p_data)[1][i][j] = bgr[1];
+        (*p_data)[2][i][j] = bgr[0];
+      }
+    }
+    res.release();
+    this->TensorProcess(p_data, prnd);
+  }
+
+ private:
+  // whether opencv processing (rotation, shear or sized crop) is needed
+  inline bool NeedOpencvProcess(void) const {
+    if (param_.max_rotate_angle > 0 || param_.max_shear_ratio > 0.0f
+        || param_.rotate > 0 || rotate_list_.size() > 0) return true;
+    if (param_.min_crop_size > 0 && param_.max_crop_size > 0) return true;
+    return false;
+  }
+  // temp input space
+  mshadow::TensorContainer<cpu, 3> tmpres_;
+  // mean image
+  mshadow::TensorContainer<cpu, 3> meanimg_;
+  /*! \brief temp space */
+  mshadow::TensorContainer<cpu, 3> img_;
+  // temporal space
+  cv::Mat temp_;
+  // rotation param
+  cv::Mat rotateM_;
+  // whether the mean file is ready
+  bool meanfile_ready_;
+  // parameters
+  ImageAugmentParam param_;
+  /*! \brief input shape */
+  mshadow::Shape<4> shape_;
+  /*! \brief list of possible rotate angle */
+  std::vector<int> rotate_list_;
+};
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_IO_IMAGE_AUGMENTER_H_
diff --git a/src/io/image_recordio.h b/src/io/image_recordio.h
new file mode 100644
index 000000000000..3b4fa0302435
--- /dev/null
+++ b/src/io/image_recordio.h
@@ -0,0 +1,77 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file image_recordio.h
+ * \brief image recordio struct
+ */
+#ifndef MXNET_IO_IMAGE_RECORDIO_H_
+#define MXNET_IO_IMAGE_RECORDIO_H_
+
+#include <dmlc/base.h>
+#include <dmlc/io.h>
+#include <string>
+
+namespace mxnet {
+namespace io {
+/*! \brief image recordio struct */
+struct ImageRecordIO {
+  /*! \brief header in image recordio */
+  struct Header {
+    /*!
+     * \brief flag of the header,
+     *  used for future extension purposes
+     */
+    uint32_t flag;
+    /*!
+     * \brief label field that returns label of images
+     *  when image list was not presented,
+     * 
+     * NOTE: user do not need to repack recordio just to
+     * change label field, just supply a list file that
+     * maps image id to new labels
+     */
+    float label;
+    /*!
+     * \brief unique image index
+     *  image_id[1] is always set to 0,
+     *  reserved for future purposes for 128bit id
+     *  image_id[0] is used to store image id
+     */
+    uint64_t image_id[2];
+  };
+  /*! \brief header of image recordio */
+  Header header;
+  /*! \brief pointer to data content */
+  uint8_t *content;
+  /*! \brief size of the content */
+  size_t content_size;
+  /*! \brief constructor */
+  ImageRecordIO(void)
+      : content(NULL), content_size(0) {
+    memset(&header, 0, sizeof(header));
+  }
+  /*! \brief get image id from record */
+  inline uint64_t image_index(void) const {
+    return header.image_id[0];
+  }
+  /*!
+   * \brief load header from a record content 
+   * \param buf the head of record
+   * \param size the size of the entire record   
+   */
+  inline void Load(void *buf, size_t size) {
+    CHECK(size >= sizeof(header));
+    std::memcpy(&header, buf, sizeof(header));
+    content = reinterpret_cast<uint8_t*>(buf) + sizeof(header);
+    content_size = size - sizeof(header);
+  }
+  /*!
+   * \brief save the record header
+   */
+  inline void SaveHeader(std::string *blob) const {
+    blob->resize(sizeof(header));
+    std::memcpy(dmlc::BeginPtr(*blob), &header, sizeof(header));
+  }
+};
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_IO_IMAGE_RECORDIO_H_
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index 1ae734631680..ed560fc2b5da 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -1,17 +1,21 @@
 /*!
  *  Copyright (c) 2015 by Contributors
- * \inst_vector.h
+ * \file inst_vector.h
  * \brief holder of a sequence of DataInst in CPU
  *        that are not necessarily of same shape
  */
+
 #ifndef MXNET_IO_INST_VECTOR_H_
 #define MXNET_IO_INST_VECTOR_H_
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
 #include <dmlc/base.h>
 #include <mshadow/tensor.h>
 #include <vector>
-#include <string>
-#include "./data.h"
+
 namespace mxnet {
+namespace io {
 /*!
  * \brief tensor vector that can store sequence of tensor
  *  in a memory compact way, tensors do not have to be of same shape
@@ -28,7 +32,7 @@ class TensorVector {
     CHECK(i + 1 < offset_.size());
     CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]);
     return mshadow::Tensor<cpu, dim, DType>
-        (reinterpret_cast<DType*>(BeginPtr(content_)) + offset_[i], shape_[i]);
+        ((DType*)dmlc::BeginPtr(content_) + offset_[i], shape_[i]);  // NOLINT(*)
   }
   inline mshadow::Tensor<cpu, dim, DType> Back() const {
     return (*this)[Size() - 1];
@@ -59,35 +63,6 @@ class TensorVector {
   std::vector<mshadow::Shape<dim> > shape_;
 };
 
-/*!
- * \brief tblob vector that can store sequence of tblob
- *  in a memory compact way, tblobs do not have to be of same shape
- */
-template<typename DType>
-class TBlobVector {
- public:
-  TBlobVector(void) {
-    this->Clear();
-  }
-  // get i-th tblob
-  inline TBlob operator[](size_t i) const;
-  // get the last tblob
-  inline TBlob Back();
-  // return the size of the vector
-  inline size_t Size(void) const;
-  // push a tensor of certain shape
-  // return the reference of the pushed tensor
-  inline void Push(TShape shape_);
-  inline void Clear(void);
- private:
-  // offset of the data content
-  std::vector<size_t> offset_;
-  // data content
-  std::vector<DType> content_;
-  // shape of data
-  std::vector<TShape > shape_;
-};
-
 /*!
  * \brief instance vector that can holds
  * non-uniform shape data instance in a shape efficient way
@@ -98,20 +73,38 @@ class InstVector {
     return index_.size();
   }
   // instance
-  inline DataInst operator[](size_t i) const;
+  inline DataInst operator[](size_t i) const {
+    DataInst inst;
+    inst.index = index_[i];
+    inst.data.push_back(TBlob(data_[i]));
+    inst.data.push_back(TBlob(label_[i]));
+    return inst;
+  }
   // get back of instance vector
-  inline DataInst Back() const;
-  // clear the container
-  inline void Clear(void);
-  // push the newly coming instance
-  inline void Push(unsigned index, TBlob data_);
+  inline DataInst Back() const {
+    return (*this)[Size() - 1];
+  }
+  inline void Clear(void) {
+    index_.clear();
+    data_.Clear();
+    label_.Clear();
+  }
+  inline void Push(unsigned index,
+                   mshadow::Shape<3> dshape,
+                   mshadow::Shape<1> lshape) {
+    index_.push_back(index);
+    data_.Push(dshape);
+    label_.Push(lshape);
+  }
 
  private:
   /*! \brief index of the data */
   std::vector<unsigned> index_;
+  // data
+  TensorVector<3, real_t> data_;
-  // data
+  // label
-  std::vector<TensorVector<real_t> > data_;
-  // extra data
-  std::vector<std::string> extra_data_;
+  TensorVector<1, real_t> label_;
 };
+}  // namespace io
+}  // namespace mxnet
 #endif  // MXNET_IO_INST_VECTOR_H_
diff --git a/src/io/io.cc b/src/io/io.cc
index bd5b78dda643..8bfb5dbdd570 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -4,7 +4,18 @@
 
 #include <mxnet/io.h>
 #include <dmlc/registry.h>
+#include "./image_augmenter.h"
+#include "./iter_batch.h"
 
+// Registers
 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::mxnet::DataIteratorReg);
 }  // namespace dmlc
+
+namespace mxnet {
+namespace io {
+// Register parameters in header files
+DMLC_REGISTER_PARAMETER(BatchParam);
+DMLC_REGISTER_PARAMETER(ImageAugmentParam);
+}  // namespace io
+}  // namespace mxnet
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
new file mode 100644
index 000000000000..b45dfd3328e1
--- /dev/null
+++ b/src/io/iter_batch.h
@@ -0,0 +1,172 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file iter_batch.h
+ * \brief adapter iterator that groups single instances from a base iterator into batches
+ * \author Tianqi Chen, Tianjun Xiao
+ */
+#ifndef MXNET_IO_ITER_BATCH_H_
+#define MXNET_IO_ITER_BATCH_H_
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <dmlc/logging.h>
+#include <mshadow/tensor.h>
+#include <utility>
+#include <string>
+#include <vector>
+
+namespace mxnet {
+namespace io {
+// Batch parameters
+struct BatchParam : public dmlc::Parameter<BatchParam> {
+  /*! \brief number of instances in one batch (declared without a default) */
+  index_t batch_size;
+  /*! \brief shape of one instance, three dimensions (default 3 x 224 x 224) */
+  TShape input_shape;
+  /*! \brief number of label values per instance */
+  index_t label_width;
+  /*! \brief use round robin to handle overflow batch */
+  bool round_batch;
+  /*! \brief test mode: reuse the first batch instead of reading again */
+  bool test_skipread;
+  /*! \brief silent */
+  bool silent;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(BatchParam) {
+    DMLC_DECLARE_FIELD(batch_size)
+        .describe("Batch size.");
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");
+    DMLC_DECLARE_FIELD(label_width).set_default(1)
+        .describe("Label width.");
+    DMLC_DECLARE_FIELD(round_batch).set_default(true)
+        .describe("Use round robin to handle overflow batch.");
+    DMLC_DECLARE_FIELD(test_skipread).set_default(false)
+        .describe("Skip read for testing.");
+    DMLC_DECLARE_FIELD(silent).set_default(false)
+        .describe("Whether to print batch information.");
+  }
+};
+
+/*! \brief create a batch iterator from single instance iterator */
+class BatchAdaptIter: public IIterator<DataBatch> {
+ public:
+  explicit BatchAdaptIter(IIterator<DataInst> *base): base_(base), head_(1), num_overflow_(0) {
+    label.dptr_ = NULL;  // FreeSpaceDense() tests this even if Init() never ran
+  }
+  virtual ~BatchAdaptIter(void) {
+    delete base_;
+    FreeSpaceDense();
+  }
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    // parse the batch fields; the same kwargs also configure the base iterator
+    param_.InitAllowUnknown(kwargs);
+    base_->Init(kwargs);
+    // batch layout: (batch_size, input_shape[0], input_shape[1], input_shape[2])
+    data_shape_ = mshadow::Shape4(param_.batch_size, param_.input_shape[0],
+                                  param_.input_shape[1], param_.input_shape[2]);
+    FreeSpaceDense();  // no-op on first use; avoids leaking buffers on a re-Init
+    AllocSpaceDense(false);
+  }
+  virtual void BeforeFirst(void) {
+    if (param_.round_batch == 0 || num_overflow_ == 0) {
+      // otherise, we already called before first
+      base_->BeforeFirst();
+    } else {
+      num_overflow_ = 0;
+    }
+    head_ = 1;
+  }
+  virtual bool Next(void) {
+    out_.num_batch_padd = 0;
+    // test mode: once a first batch has been produced (head_ == 0), keep
+    // reporting success without reading from the base iterator again
+    if (param_.test_skipread != 0 && head_ == 0)
+        return true;
+    else
+        this->head_ = 0;
+
+    // the previous batch already wrapped around: report the end until BeforeFirst is called
+    if (num_overflow_ != 0) return false;
+    index_t top = 0;
+    // copy instances into the batch buffers until batch_size of them are collected
+    while (base_->Next()) {
+      const DataInst& d = base_->Value();
+      mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
+      out_.inst_index[top] = d.index;
+      mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
+
+      if (++ top >= param_.batch_size) {
+        out_.data[0] = TBlob(data);
+        out_.data[1] = TBlob(label);
+        return true;
+      }
+    }
+    if (top != 0) {
+      if (param_.round_batch != 0) {
+        num_overflow_ = 0;
+        base_->BeforeFirst();
+        for (; top < param_.batch_size; ++top, ++num_overflow_) {
+          CHECK(base_->Next()) << "number of input must be bigger than batch size";
+          const DataInst& d = base_->Value();
+          mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
+          out_.inst_index[top] = d.index;
+          mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
+        }
+        out_.num_batch_padd = num_overflow_;
+      } else {
+        out_.num_batch_padd = param_.batch_size - top;
+      }
+      out_.data[0] = TBlob(data);
+      out_.data[1] = TBlob(label);
+      return true;
+    }
+    return false;
+  }
+  virtual const DataBatch &Value(void) const {
+    CHECK(head_ == 0) << "must call Next to get value";
+    return out_;
+  }
+
+ private:
+  /*! \brief batch parameters */
+  BatchParam param_;
+  /*! \brief base iterator */
+  IIterator<DataInst> *base_;
+  /*! \brief output data */
+  DataBatch out_;
+  /*! \brief on first */
+  int head_;
+  /*! \brief number of overflow instances that readed in round_batch mode */
+  int num_overflow_;
+  /*! \brief label information of the data*/
+  mshadow::Tensor<mshadow::cpu, 2> label;
+  /*! \brief content of dense data, if this DataBatch is dense */
+  mshadow::Tensor<mshadow::cpu, 4> data;
+  /*! \brief data shape */
+  mshadow::Shape<4> data_shape_;
+  // Functions that allocate and free tensor space
+  inline void AllocSpaceDense(bool pad = false) {
+    data = mshadow::NewTensor<mshadow::cpu>(data_shape_, 0.0f, pad);
+    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width);
+    label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
+    out_.inst_index = new unsigned[param_.batch_size];
+    out_.batch_size = param_.batch_size;
+    out_.data.resize(2);
+  }
+  /*! \brief auxiliary function to free space, if needed, dense only */
+  inline void FreeSpaceDense(void) {
+    if (label.dptr_ != NULL) {
+      delete [] out_.inst_index;
+      mshadow::FreeSpace(&label);
+      mshadow::FreeSpace(&data);
+      label.dptr_ = NULL;
+    }
+  }
+};  // class BatchAdaptIter
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_IO_ITER_BATCH_H_
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
new file mode 100644
index 000000000000..701c28deb4c9
--- /dev/null
+++ b/src/io/iter_image_recordio.cc
@@ -0,0 +1,422 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file iter_image_recordio.cc
+ * \brief data iterator over images packed
+ *        in recordio format
+ */
+#include <dmlc/base.h>
+#include <dmlc/io.h>
+#include <dmlc/omp.h>
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <dmlc/recordio.h>
+#include <dmlc/threadediter.h>
+#include <unordered_map>
+#include <vector>
+#include <cstdlib>
+#include "./inst_vector.h"
+#include "./image_recordio.h"
+#include "./image_augmenter.h"
+#include "./iter_batch.h"
+namespace mxnet {
+namespace io {
+/*! \brief data structure to hold labels for images */
+class ImageLabelMap {
+ public:
+  /*!
+   * \brief initialize the label list into memory
+   * \param path_imglist path to the image list
+   * \param label_width predefined label_width
+   * \param silent when true, do not log the number of loaded records
+   */
+  explicit ImageLabelMap(const char *path_imglist,
+                         mshadow::index_t label_width,
+                         bool silent) {
+    this->label_width = label_width;
+    dmlc::InputSplit *fi = dmlc::InputSplit::Create
+        (path_imglist, 0, 1, "text");
+    dmlc::InputSplit::Blob rec;
+    while (fi->NextRecord(&rec)) {
+      // quick manual parsing of "<image index> <label 0> ... <label width-1> ..."
+      char *p = reinterpret_cast<char*>(rec.dptr);
+      char *end = p + rec.size;
+      // skip space; the bound is tested first so that *end is never read
+      while (p != end && isspace(*p)) ++p;
+      image_index_.push_back(static_cast<size_t>(atol(p)));
+      for (size_t i = 0; i < label_width; ++i) {
+        // skip till space
+        while (p != end && !isspace(*p)) ++p;
+        // skip space
+        while (p != end && isspace(*p)) ++p;
+        CHECK(p != end) << "Bad ImageList format";
+        label_.push_back(static_cast<real_t>(atof(p)));
+      }
+    }
+    delete fi;
+    // idx2label_ stores raw pointers into label_, so label_ must not be
+    // resized (and thereby reallocated) after this point
+    idx2label_.reserve(image_index_.size());
+    // record i owns label_[i * label_width, (i + 1) * label_width)
+    for (size_t i = 0; i < image_index_.size(); ++i) {
+      idx2label_[image_index_[i]] = dmlc::BeginPtr(label_) + i * label_width;
+    }
+    if (!silent) {
+      LOG(INFO) << "Loaded ImageList from " << path_imglist << ' '
+                << image_index_.size() << " Image records";
+    }
+  }
+  /*! \brief return a tensor over the label_width labels of image imid; fails a CHECK if absent */
+  inline mshadow::Tensor<cpu, 1> Find(size_t imid) const {
+    std::unordered_map<size_t, real_t*>::const_iterator it
+        = idx2label_.find(imid);
+    CHECK(it != idx2label_.end()) << "fail to find imagelabel for id " << imid;
+    return mshadow::Tensor<cpu, 1>(it->second, mshadow::Shape1(label_width));
+  }
+
+ private:
+  // number of label values stored per image
+  mshadow::index_t label_width;
+  // image index of each record
+  std::vector<size_t> image_index_;
+  // real label content
+  std::vector<real_t> label_;
+  // map index to label
+  std::unordered_map<size_t, real_t*> idx2label_;
+};
+
+// Define image record parser parameters
+struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
+  /*! \brief path to image list */
+  std::string path_imglist;
+  /*! \brief path to image recordio */
+  std::string path_imgrec;
+  /*! \brief number of threads */
+  int nthread;
+  /*! \brief whether to remain silent */
+  bool silent;
+  /*! \brief number of distributed worker */
+  int dist_num_worker, dist_worker_rank;
+  /*! \brief label-width */
+  int label_width;
+  /*! \brief input shape */
+  TShape input_shape;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(ImageRecParserParam) {
+    DMLC_DECLARE_FIELD(path_imglist).set_default("")
+        .describe("Path to image list.");
+    DMLC_DECLARE_FIELD(path_imgrec).set_default("./data/imgrec.rec")
+        .describe("Path to image record file.");
+    DMLC_DECLARE_FIELD(nthread).set_lower_bound(1).set_default(4)
+        .describe("Number of thread to do parsing.");
+    DMLC_DECLARE_FIELD(label_width).set_lower_bound(1).set_default(1)
+        .describe("How many labels for an image.");
+    DMLC_DECLARE_FIELD(silent).set_default(false)
+        .describe("Whether to output parser information.");
+    DMLC_DECLARE_FIELD(dist_num_worker).set_lower_bound(1).set_default(1)
+        .describe("Dist worker number.");
+    DMLC_DECLARE_FIELD(dist_worker_rank).set_default(0)
+        .describe("Dist worker rank.");
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");
+  }
+};
+
+// parser to parse image recordio
+class ImageRecordIOParser {
+ public:
+  ImageRecordIOParser(void)
+      : source_(NULL),
+        label_map_(NULL) {
+  }
+  ~ImageRecordIOParser(void) {
+    // both pointers may still be NULL (Init not called); deleting NULL is a no-op
+    delete label_map_;
+    delete source_;
+    for (size_t i = 0; i < augmenters_.size(); ++i) {
+      delete augmenters_[i];
+    }
+    for (size_t i = 0; i < prnds_.size(); ++i) {
+      delete prnds_[i];
+    }
+  }
+  // initialize the parser from key/value arguments; must run before ParseNext
+  inline void Init(const std::vector<std::pair<std::string, std::string> >& kwargs);
+
+  // rewind the record source to its first record
+  inline void BeforeFirst(void) {
+    return source_->BeforeFirst();
+  }
+  // parse the next chunk: (*out)[t] receives the instances decoded by thread t;
+  // returns false once the source has no more chunks
+  inline bool ParseNext(std::vector<InstVector> *out);
+
+ private:
+  // magic number used to seed the per-thread random engines
+  static const int kRandMagic = 111;
+  /*! \brief parameters */
+  ImageRecParserParam param_;
+  /*! \brief augmenters */
+  std::vector<ImageAugmenter*> augmenters_;
+  /*! \brief random samplers */
+  std::vector<common::RANDOM_ENGINE*> prnds_;
+  /*! \brief data source */
+  dmlc::InputSplit *source_;
+  /*! \brief label information, if any */
+  ImageLabelMap *label_map_;
+  /*! \brief temp space */
+  mshadow::TensorContainer<cpu, 3> img_;
+};
+
+inline void ImageRecordIOParser::Init(
+        const std::vector<std::pair<std::string, std::string> >& kwargs) {
+  // parse the parser's own fields; unknown keys are allowed because the same
+  // kwargs are also handed to every ImageAugmenter below
+  param_.InitAllowUnknown(kwargs);
+  // cap the decoder threads at (#procs / 2 - 1), but never below one;
+  // omp_get_num_procs() needs no parallel region around it
+  const int maxthread = std::max(omp_get_num_procs() / 2 - 1, 1);
+  param_.nthread = std::min(maxthread, param_.nthread);
+  // query the granted team size; only the master thread writes the shared result
+  int threadget = 1;
+  #pragma omp parallel num_threads(param_.nthread)
+  {
+    #pragma omp master
+    threadget = omp_get_num_threads();
+  }
+  param_.nthread = threadget;
+  // one augmenter and one separately seeded random engine per decoder thread
+  for (int i = 0; i < threadget; ++i) {
+    augmenters_.push_back(new ImageAugmenter());
+    augmenters_[i]->Init(kwargs);
+    prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic));
+  }
+
+  // the PS_RANK environment variable, when set, overrides dist_worker_rank
+  const char *ps_rank = getenv("PS_RANK");
+  if (ps_rank != NULL) {
+    param_.dist_worker_rank = atoi(ps_rank);
+  }
+  // labels come from the image list when one is given, else from the record header
+  if (param_.path_imglist.length() != 0) {
+    label_map_ = new ImageLabelMap(param_.path_imglist.c_str(),
+                                   param_.label_width, param_.silent != 0);
+  } else {
+    param_.label_width = 1;
+  }
+  CHECK(param_.path_imgrec.length() != 0)
+    << "ImageRecordIOIterator: must specify path_imgrec";
+#if MSHADOW_DIST_PS
+    param_.dist_num_worker = ::ps::RankSize();
+    param_.dist_worker_rank = ::ps::MyRank();
+    LOG(INFO) << "rank " << param_.dist_worker_rank
+              << " in " << param_.dist_num_worker;
+#endif
+  source_ = dmlc::InputSplit::Create
+      (param_.path_imgrec.c_str(), param_.dist_worker_rank,
+       param_.dist_num_worker, "recordio");
+  // hint an 8 MB (8 << 20) chunk size when possible
+  source_->HintChunkSize(8 << 20UL);
+}
+
+inline bool ImageRecordIOParser::
+ParseNext(std::vector<InstVector> *out_vec) {
+  CHECK(source_ != NULL);
+  dmlc::InputSplit::Blob chunk;
+  if (!source_->NextChunk(&chunk)) return false;
+  out_vec->resize(param_.nthread);
+  #pragma omp parallel num_threads(param_.nthread)
+  {
+    CHECK(omp_get_num_threads() == param_.nthread);
+    int tid = omp_get_thread_num();
+    dmlc::RecordIOChunkReader reader(chunk, tid, param_.nthread);
+    ImageRecordIO rec;
+    dmlc::InputSplit::Blob blob;
+    // thread-private decode buffer: sharing the img_ member across threads is a data race
+    mshadow::TensorContainer<cpu, 3> img;
+    InstVector &out = (*out_vec)[tid];
+    out.Clear();
+    while (reader.NextRecord(&blob)) {
+      rec.Load(blob.dptr, blob.size);
+      out.Push(static_cast<unsigned>(rec.image_index()),
+               mshadow::Shape3(param_.input_shape[0], param_.input_shape[1], param_.input_shape[2]),
+               mshadow::Shape1(param_.label_width));
+      DataInst inst = out.Back();  // blobs of the slot that was just pushed
+      mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>();
+      mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>();
+      augmenters_[tid]->Process(rec.content, rec.content_size, &img, prnds_[tid]);
+      mshadow::Copy(data, img);
+      if (label_map_ != NULL) {
+        mshadow::Copy(label, label_map_->Find(rec.image_index()));
+      } else {
+        label[0] = rec.header.label;
+      }
+    }
+  }
+  return true;
+}
+
+// Define image record parameters
+struct ImageRecordParam: public dmlc::Parameter<ImageRecordParam> {
+  /*! \brief whether to do shuffle */
+  bool shuffle;
+  /*! \brief random seed */
+  int seed;
+  /*! \brief mean file string*/
+  std::string mean_img;
+  /*! \brief whether to remain silent */
+  bool silent;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(ImageRecordParam) {
+    DMLC_DECLARE_FIELD(shuffle).set_default(true)
+        .describe("Whether to shuffle data.");
+    DMLC_DECLARE_FIELD(seed).set_default(0)
+        .describe("Random Seed.");
+    DMLC_DECLARE_FIELD(mean_img).set_default("./data/mean.bin")
+        .describe("Path to image mean file.");
+    DMLC_DECLARE_FIELD(silent).set_default(false)
+        .describe("Whether to output information.");
+  }
+};
+
+// iterator on image recordio
+class ImageRecordIter : public IIterator<DataInst> {
+ public:
+  ImageRecordIter()
+      : data_(NULL) {
+  }
+  virtual ~ImageRecordIter(void) {
+    iter_.Destroy();
+    // data can be NULL
+    delete data_;
+  }
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    // parse iterator fields; the parser below reads its own fields from the same kwargs
+    param_.InitAllowUnknown(kwargs);
+    // use the kwarg to init parser
+    parser_.Init(kwargs);
+    // init thread iter
+    iter_.set_max_capacity(4);
+    iter_.Init([this](std::vector<InstVector> **dptr) {
+        if (*dptr == NULL) {
+          *dptr = new std::vector<InstVector>();
+        }
+        return parser_.ParseNext(*dptr);
+      },
+      [this]() { parser_.BeforeFirst(); });
+    // Check Meanfile
+    if (param_.mean_img.length() != 0) {
+      dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
+      if (fi == NULL) {
+        this->CreateMeanImg();
+        this->BeforeFirst();  // the averaging pass drained the stream; rewind it
+      } else {
+        delete fi;
+      }
+    }
+    inst_ptr_ = 0;
+  }
+  virtual void BeforeFirst(void) {
+    iter_.BeforeFirst();
+    inst_order_.clear();
+    inst_ptr_ = 0;
+  }
+  // Advance to the next instance; returns false once the source is exhausted.
+  virtual bool Next(void) {
+    while (true) {
+      if (inst_ptr_ < inst_order_.size()) {
+        std::pair<unsigned, unsigned> p = inst_order_[inst_ptr_];
+        out_ = (*data_)[p.first][p.second];
+        ++inst_ptr_;
+        return true;
+      }
+      // current chunk is used up: hand it back and fetch the next parsed chunk
+      if (data_ != NULL) iter_.Recycle(&data_);
+      if (!iter_.Next(&data_)) return false;
+      inst_order_.clear();
+      for (unsigned i = 0; i < data_->size(); ++i) {
+        const InstVector &tmp = (*data_)[i];
+        for (unsigned j = 0; j < tmp.Size(); ++j) {
+          inst_order_.push_back(std::make_pair(i, j));
+        }
+      }
+      // shuffle_ is never assigned anywhere, so consult the parsed parameter
+      if (param_.shuffle) {
+        std::shuffle(inst_order_.begin(), inst_order_.end(),
+                     common::RANDOM_ENGINE(kRandMagic + param_.seed));
+      }
+      inst_ptr_ = 0;
+    }
+  }
+  virtual const DataInst &Value(void) const {
+    return out_;
+  }
+
+ private:
+  // Average all images of one full pass into meanimg_ and save it to param_.mean_img.
+  inline void CreateMeanImg(void) {
+    if (param_.silent == 0) {
+      printf("cannot find %s: create mean image, this will take some time...\n",
+              param_.mean_img.c_str());
+    }
+    time_t start = time(NULL);
+    size_t imcnt = 1;
+    this->BeforeFirst();
+    CHECK(this->Next()) << "input iterator failed.";
+    // Get the first data
+    mshadow::Tensor<mshadow::cpu, 3> img_tensor = out_.data[0].get<mshadow::cpu, 3, float>();
+    meanimg_.Resize(img_tensor.shape_);
+    mshadow::Copy(meanimg_, img_tensor);
+    while (this->Next()) {
+      mshadow::Tensor<mshadow::cpu, 3> img_tensor = out_.data[0].get<mshadow::cpu, 3, float>();
+      meanimg_ += img_tensor; imcnt += 1;
+      if (imcnt % 1000 == 0 && param_.silent == 0) {
+        printf("\r                                                               \r");
+        // %zu and difftime(): size_t / time_t need not match long on every platform
+        printf("[%8zu] images processed, %.0f sec elapsed", imcnt, difftime(time(NULL), start));
+        fflush(stdout);
+      }
+    }
+    meanimg_ *= (1.0f / imcnt);
+
+    dmlc::Stream *fo = dmlc::Stream::Create(param_.mean_img.c_str(), "w");
+    meanimg_.SaveBinary(*fo);
+    delete fo;
+    if (param_.silent == 0) {
+      printf("save mean image to %s..\n", param_.mean_img.c_str());
+    }
+  }
+
+  // random magic
+  static const int kRandMagic = 111;
+  // output instance
+  DataInst out_;
+  // whether shuffle data
+  int shuffle_;
+  // data ptr
+  size_t inst_ptr_;
+  // internal instance order
+  std::vector<std::pair<unsigned, unsigned> > inst_order_;
+  // data
+  std::vector<InstVector> *data_;
+  // internal parser
+  ImageRecordIOParser parser_;
+  // backend thread
+  dmlc::ThreadedIter<std::vector<InstVector> > iter_;
+  // parameters
+  ImageRecordParam param_;
+  // mean image
+  mshadow::TensorContainer<cpu, 3> meanimg_;
+};
+DMLC_REGISTER_PARAMETER(ImageRecParserParam);
+DMLC_REGISTER_PARAMETER(ImageRecordParam);
+MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, BatchAdaptIter)
+    .describe("Create iterator for dataset packed in recordio.")
+    .add_arguments(ImageRecordParam::__FIELDS__())
+    .add_arguments(ImageRecParserParam::__FIELDS__())
+    .add_arguments(BatchParam::__FIELDS__())
+    .add_arguments(ImageAugmentParam::__FIELDS__());
+}  // namespace io
+}  // namespace mxnet
diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc
index 93195061b278..77ac3a479f75 100644
--- a/src/io/iter_mnist.cc
+++ b/src/io/iter_mnist.cc
@@ -31,7 +31,7 @@ struct MNISTParam : public dmlc::Parameter<MNISTParam> {
   bool flat;
   /*! \brief random seed */
   int seed;
-  // declare parameters in header file
+  // declare parameters
   DMLC_DECLARE_PARAMETER(MNISTParam) {
     DMLC_DECLARE_FIELD(image).set_default("./train-images-idx3-ubyte")
         .describe("Mnist image path.");
@@ -155,7 +155,7 @@ class MNISTIter: public IIterator<DataBatch> {
     delete stdlabel;
   }
   inline void Shuffle(void) {
-    std::shuffle(inst_.begin(), inst_.end(), common::RANDOM_ENGINE(kRandMagic+param_.seed));
+    std::shuffle(inst_.begin(), inst_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
     std::vector<float> tmplabel(labels_.size());
     mshadow::TensorContainer<cpu, 3> tmpimg(img_.shape_);
     for (size_t i = 0; i < inst_.size(); ++i) {
diff --git a/tests/python/get_data.py b/tests/python/get_data.py
index 82d25d9072fb..828809f3e757 100644
--- a/tests/python/get_data.py
+++ b/tests/python/get_data.py
@@ -27,3 +27,10 @@ def GetMNIST_ubyte():
         os.system("wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz -P data/")
         os.system("gunzip data/t10k-labels-idx1-ubyte.gz")
 
+# download the cifar10 archive into data/ and unpack it there (tests read data/cifar/...)
+def GetCifar10():
+    if not os.path.isdir("data/"):
+        os.system("mkdir data/")
+    if not os.path.exists('data/cifar10.zip'):
+        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip -P data/")
+        os.system("unzip data/cifar10.zip -d data/")
diff --git a/tests/python/test_io.py b/tests/python/test_io.py
index dfeb3f67c293..1156782bdfef 100644
--- a/tests/python/test_io.py
+++ b/tests/python/test_io.py
@@ -5,28 +5,29 @@
 import pickle as pickle
 import sys
 import get_data
+#from PIL import Image
 
-# prepare data
-get_data.GetMNIST_ubyte()
 
-batch_size = 100
-train_dataiter = mx.io.MNISTIter(
-        image="data/train-images-idx3-ubyte",
-        label="data/train-labels-idx1-ubyte",
-        batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
-val_dataiter = mx.io.MNISTIter(
-        image="data/t10k-images-idx3-ubyte",
-        label="data/t10k-labels-idx1-ubyte",
-        batch_size=batch_size, shuffle=0, flat=1, silent=0)
+def test_MNISTIter():
+    # prepare data
+    get_data.GetMNIST_ubyte()
 
-def test_MNISTIter_loop():
+    batch_size = 100
+    train_dataiter = mx.io.MNISTIter(
+            image="data/train-images-idx3-ubyte",
+            label="data/train-labels-idx1-ubyte",
+            batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
+    val_dataiter = mx.io.MNISTIter(
+            image="data/t10k-images-idx3-ubyte",
+            label="data/t10k-labels-idx1-ubyte",
+            batch_size=batch_size, shuffle=0, flat=1, silent=0)
+    # test_loop
     nbatch = 60000 / batch_size
     batch_count = 0
     for data, label in train_dataiter:
         batch_count += 1
     assert(nbatch == batch_count)
-
-def test_MNISTIter_reset():
+    # test_reset
     train_dataiter.reset()
     train_dataiter.iter_next()
     label_0 = train_dataiter.getlabel().numpy.flatten()
@@ -39,3 +40,65 @@ def test_MNISTIter_reset():
     label_1 = train_dataiter.getlabel().numpy.flatten()
     assert(sum(label_0 - label_1) == 0)
 
+'''
+def test_ImageRecIter():
+    dataiter = mx.io.ImageRecordIter(
+            path_imgrec="data/val_cxxnet.rec",
+            mean_img="data/smallset/image_net_mean.bin",
+            rand_crop=True,
+            mirror=True,
+            input_shape=(3,227,227),
+            batch_size=100,
+            nthread=1,
+            seed=10)
+    labelcount = [0 for i in range(1000)] 
+    batchcount = 0
+    for data, label in dataiter:
+        npdata = data.numpy
+        print npdata[0,:,:,:]
+        imgdata = np.zeros([227, 227, 3], dtype=np.uint8)
+        imgdata[:,:,0] = npdata[10,2,:,:]
+        imgdata[:,:,1] = npdata[10,1,:,:]
+        imgdata[:,:,2] = npdata[10,0,:,:]
+        img = Image.fromarray(imgdata)
+        imgpath = "data/smallset/test_3.jpg"
+        img.save(imgpath, format='JPEG')
+        exit(0)
+        print batchcount
+        sys.stdout.flush()
+        batchcount += 1
+        nplabel = label.numpy
+        for i in range(nplabel.shape[0]):
+            labelcount[int(nplabel[i])] += 1
+
+def test_Cifar10Rec():
+    dataiter = mx.io.ImageRecordIter(
+            path_imgrec="data/cifar/test.rec",
+            mean_img="data/cifar/cifar10_mean.bin",
+            rand_crop=True,
+            rand_mirror=True,
+            input_shape=(3,28,28),
+            batch_size=100,
+            nthread=1)
+    labelcount = [0 for i in range(10)] 
+    batchcount = 0
+    for data, label in dataiter:
+        npdata = data.numpy
+        print npdata[0,:,:,:]
+        imgdata = np.zeros([28, 28, 3], dtype=np.uint8)
+        imgdata[:,:,0] = npdata[0,2,:,:]
+        imgdata[:,:,1] = npdata[0,1,:,:]
+        imgdata[:,:,2] = npdata[0,0,:,:]
+        img = Image.fromarray(imgdata)
+        imgpath = "data/cifar/test.jpg"
+        img.save(imgpath, format='JPEG')
+        exit(0)
+        print "Batch: ", batchcount
+        sys.stdout.flush()
+        batchcount += 1
+        nplabel = label.numpy
+        for i in range(nplabel.shape[0]):
+            labelcount[int(nplabel[i])] += 1
+    for i in range(10):
+        assert(labelcount[i] == 1000)
+'''