diff --git a/configs/benchmarks/linear_classification/imagenet/r50_last.py b/configs/benchmarks/linear_classification/imagenet/r50_last.py
index 625cdefb..1d476619 100644
--- a/configs/benchmarks/linear_classification/imagenet/r50_last.py
+++ b/configs/benchmarks/linear_classification/imagenet/r50_last.py
@@ -28,15 +28,16 @@ img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 train_pipeline = [
     dict(type='RandomResizedCrop', size=224),
     dict(type='RandomHorizontalFlip'),
-    dict(type='ToTensor'),
-    dict(type='Normalize', **img_norm_cfg),
 ]
 test_pipeline = [
     dict(type='Resize', size=256),
     dict(type='CenterCrop', size=224),
-    dict(type='ToTensor'),
-    dict(type='Normalize', **img_norm_cfg),
 ]
+# prefetch: when False, ToTensor/Normalize are appended to both pipelines below
+prefetch = False
+if not prefetch:
+    train_pipeline.extend([dict(type='ToTensor'), dict(type='Normalize', **img_norm_cfg)])
+    test_pipeline.extend([dict(type='ToTensor'), dict(type='Normalize', **img_norm_cfg)])
 data = dict(
     imgs_per_gpu=32,  # total 32*8=256, 8GPU linear cls
     workers_per_gpu=5,
@@ -45,12 +46,14 @@ data = dict(
         data_source=dict(
             list_file=data_train_list, root=data_train_root,
             **data_source_cfg),
-        pipeline=train_pipeline),
+        pipeline=train_pipeline,
+        prefetch=prefetch),
     val=dict(
         type=dataset_type,
         data_source=dict(
             list_file=data_test_list, root=data_test_root, **data_source_cfg),
-        pipeline=test_pipeline))
+        pipeline=test_pipeline,
+        prefetch=prefetch))
 # additional hooks
 custom_hooks = [
     dict(
@@ -60,6 +63,8 @@ custom_hooks = [
         interval=1,
         imgs_per_gpu=128,
         workers_per_gpu=4,
+        prefetch=prefetch,
+        img_norm_cfg=img_norm_cfg,
         eval_param=dict(topk=(1, 5)))
 ]
 # optimizer
diff --git a/configs/benchmarks/linear_classification/imagenet/r50_last_sobel.py b/configs/benchmarks/linear_classification/imagenet/r50_last_sobel.py
index b7d24027..ec4f7723 100644
--- a/configs/benchmarks/linear_classification/imagenet/r50_last_sobel.py
+++ b/configs/benchmarks/linear_classification/imagenet/r50_last_sobel.py
@@ -28,15 +28,16 @@ img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 train_pipeline = [
     dict(type='RandomResizedCrop', size=224),
     dict(type='RandomHorizontalFlip'),
-    dict(type='ToTensor'),
-    dict(type='Normalize', **img_norm_cfg),
 ]
 test_pipeline = [
     dict(type='Resize', size=256),
     dict(type='CenterCrop', size=224),
-    dict(type='ToTensor'),
-    dict(type='Normalize', **img_norm_cfg),
 ]
+# prefetch: when False, ToTensor/Normalize are appended to both pipelines below
+prefetch = False
+if not prefetch:
+    train_pipeline.extend([dict(type='ToTensor'), dict(type='Normalize', **img_norm_cfg)])
+    test_pipeline.extend([dict(type='ToTensor'), dict(type='Normalize', **img_norm_cfg)])
 data = dict(
     imgs_per_gpu=32,  # total 32*8=256, 8GPU linear cls
     workers_per_gpu=5,
@@ -45,12 +46,14 @@ data = dict(
         data_source=dict(
             list_file=data_train_list, root=data_train_root,
             **data_source_cfg),
-        pipeline=train_pipeline),
+        pipeline=train_pipeline,
+        prefetch=prefetch),
     val=dict(
         type=dataset_type,
         data_source=dict(
             list_file=data_test_list, root=data_test_root, **data_source_cfg),
-        pipeline=test_pipeline))
+        pipeline=test_pipeline,
+        prefetch=prefetch))
 # additional hooks
 custom_hooks = [
     dict(
@@ -60,6 +63,8 @@ custom_hooks = [
         interval=1,
         imgs_per_gpu=128,
         workers_per_gpu=4,
+        prefetch=prefetch,
+        img_norm_cfg=img_norm_cfg,
         eval_param=dict(topk=(1, 5)))
 ]
 # optimizer
diff --git a/configs/benchmarks/linear_classification/places205/r50_multihead.py b/configs/benchmarks/linear_classification/places205/r50_multihead.py
index 8a61eb6e..8826bb41 100644
--- a/configs/benchmarks/linear_classification/places205/r50_multihead.py
+++ b/configs/benchmarks/linear_classification/places205/r50_multihead.py
@@ -35,15 +35,16 @@ train_pipeline = [
     dict(type='CenterCrop', size=256),
     dict(type='RandomCrop', size=224),
     dict(type='RandomHorizontalFlip'),
-    dict(type='ToTensor'),
-    dict(type='Normalize', **img_norm_cfg),
 ]
 test_pipeline = [
     dict(type='Resize', size=256),
     dict(type='CenterCrop', size=224),
-    dict(type='ToTensor'),
-    dict(type='Normalize', **img_norm_cfg),
 ]
+# prefetch: when False, ToTensor/Normalize are appended to both pipelines below
+prefetch = False
+if not prefetch:
+    train_pipeline.extend([dict(type='ToTensor'), dict(type='Normalize', **img_norm_cfg)])
+    test_pipeline.extend([dict(type='ToTensor'), dict(type='Normalize', **img_norm_cfg)])
 data = dict(
     imgs_per_gpu=32,  # total 32x8=256
     workers_per_gpu=4,
@@ -52,12 +53,14 @@ data = dict(
         data_source=dict(
             list_file=data_train_list, root=data_train_root,
             **data_source_cfg),
-        pipeline=train_pipeline),
+        pipeline=train_pipeline,
+        prefetch=prefetch),
     val=dict(
         type=dataset_type,
         data_source=dict(
             list_file=data_test_list, root=data_test_root, **data_source_cfg),
-        pipeline=test_pipeline))
+        pipeline=test_pipeline,
+        prefetch=prefetch))
 # additional hooks
 custom_hooks = [
     dict(
@@ -67,6 +70,8 @@ custom_hooks = [
         interval=10,
         imgs_per_gpu=32,
         workers_per_gpu=4,
+        prefetch=prefetch,
+        img_norm_cfg=img_norm_cfg,
         eval_param=dict(topk=(1, )))
 ]
 # optimizer
diff --git a/configs/benchmarks/linear_classification/places205/r50_multihead_sobel.py b/configs/benchmarks/linear_classification/places205/r50_multihead_sobel.py
index 9dea3f3d..7e5cd869 100644
--- a/configs/benchmarks/linear_classification/places205/r50_multihead_sobel.py
+++ b/configs/benchmarks/linear_classification/places205/r50_multihead_sobel.py
@@ -35,15 +35,16 @@ train_pipeline = [
     dict(type='CenterCrop', size=256),
     dict(type='RandomCrop', size=224),
     dict(type='RandomHorizontalFlip'),
-    dict(type='ToTensor'),
-    dict(type='Normalize', **img_norm_cfg),
 ]
 test_pipeline = [
     dict(type='Resize', size=256),
     dict(type='CenterCrop', size=224),
-    dict(type='ToTensor'),
-    dict(type='Normalize', **img_norm_cfg),
 ]
+# prefetch: when False, ToTensor/Normalize are appended to both pipelines below
+prefetch = False
+if not prefetch:
+    train_pipeline.extend([dict(type='ToTensor'), dict(type='Normalize', **img_norm_cfg)])
+    test_pipeline.extend([dict(type='ToTensor'), dict(type='Normalize', **img_norm_cfg)])
 data = dict(
     imgs_per_gpu=32,  # total 32x8=256
     workers_per_gpu=4,
@@ -52,12 +53,14 @@ data = dict(
         data_source=dict(
             list_file=data_train_list, root=data_train_root,
             **data_source_cfg),
-        pipeline=train_pipeline),
+        pipeline=train_pipeline,
+        prefetch=prefetch),
     val=dict(
         type=dataset_type,
         data_source=dict(
             list_file=data_test_list, root=data_test_root, **data_source_cfg),
-        pipeline=test_pipeline))
+        pipeline=test_pipeline,
+        prefetch=prefetch))
 # additional hooks
 custom_hooks = [
     dict(
@@ -67,6 +70,8 @@ custom_hooks = [
         interval=10,
         imgs_per_gpu=32,
         workers_per_gpu=4,
+        prefetch=prefetch,
+        img_norm_cfg=img_norm_cfg,
         eval_param=dict(topk=(1, )))
 ]
 # optimizer
diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md
index 4efe859d..bb3a56fe 100644
--- a/docs/GETTING_STARTED.md
+++ b/docs/GETTING_STARTED.md
@@ -92,7 +92,7 @@ prefetch = True
 3 . Replacing  Pillow with Pillow-SIMD (https://github.com/uploadcare/pillow-simd.git) to make use of SIMD command sets with modern CPU.
  ```shell
 pip uninstall pillow
-pip install Pillow-SIMD
+pip install Pillow-SIMD  # if AVX2 is available, use instead: CC="cc -mavx2" pip install -U --force-reinstall pillow-simd
 ```
 We test it using MoCoV2 using a total batch size of 256 on Tesla V100. The training time per step is decreased to 0.17s from 0.23s.
 ## Benchmarks
diff --git a/openselfsup/datasets/classification.py b/openselfsup/datasets/classification.py
index bfb5174a..cc4094ae 100644
--- a/openselfsup/datasets/classification.py
+++ b/openselfsup/datasets/classification.py
@@ -4,6 +4,7 @@ from openselfsup.utils import print_log
 
 from .registry import DATASETS
 from .base import BaseDataset
+from .utils import to_numpy
 
 
 @DATASETS.register_module
@@ -11,12 +12,14 @@ class ClassificationDataset(BaseDataset):
     """Dataset for classification.
     """
 
-    def __init__(self, data_source, pipeline):
-        super(ClassificationDataset, self).__init__(data_source, pipeline)
+    def __init__(self, data_source, pipeline, prefetch=False):
+        super(ClassificationDataset, self).__init__(data_source, pipeline, prefetch)
 
     def __getitem__(self, idx):
         img, target = self.data_source.get_sample(idx)
         img = self.pipeline(img)
+        if self.prefetch:
+            img = torch.from_numpy(to_numpy(img))
         return dict(img=img, gt_label=target)
 
     def evaluate(self, scores, keyword, logger=None, topk=(1, 5)):
diff --git a/openselfsup/hooks/validate_hook.py b/openselfsup/hooks/validate_hook.py
index 1dabcaa6..b769183a 100644
--- a/openselfsup/hooks/validate_hook.py
+++ b/openselfsup/hooks/validate_hook.py
@@ -42,7 +42,10 @@ class ValidateHook(Hook):
             eval_kwargs['imgs_per_gpu'],
             eval_kwargs['workers_per_gpu'],
             dist=dist_mode,
-            shuffle=False)
+            shuffle=False,
+            prefetch=eval_kwargs.get('prefetch', False),
+            img_norm_cfg=eval_kwargs.get('img_norm_cfg', dict()),
+        )
         self.dist_mode = dist_mode
         self.initial = initial
         self.interval = interval