commit b742b7c18b5826caa4ecc1f066dd281ac9f51a9a
Author: xiaohangzhan
Date:   Tue Jun 16 00:05:18 2020 +0800

    upload code

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..aedb855e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,128 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+mmdet/version.py
+data
+.vscode
+.idea
+
+# custom
+*.pkl
+*.pkl.json
+*.log.json
+work_dirs/
+pretrains
+pretrains/
+
+# Pytorch
+*.pth
+
+*.swp
+source.sh
+tensorboard.sh
+.DS_Store
+replace.sh
+benchmarks/detection/datasets
+benchmarks/detection/output
diff --git a/.style.yapf b/.style.yapf
new file mode 100644
index 00000000..286a3f1d
--- /dev/null
+++ b/.style.yapf
@@ -0,0 +1,4 @@
+[style]
+BASED_ON_STYLE = pep8
+BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
+SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..1a7360a8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,107 @@
+
+# OpenSelfSup
+
+## Introduction
+
+The master branch works with **PyTorch 1.1** or higher.
+
+OpenSelfSup is an open-source unsupervised representation learning toolbox based on PyTorch.
+
+### What does this repo do?
+
+Below are the relations among Unsupervised Learning, Self-Supervised Learning and Representation Learning. This repo focuses on the shaded area, i.e., Unsupervised Representation Learning, of which Self-Supervised Representation Learning is the major branch. Since in many cases we do not strictly distinguish between Self-Supervised Representation Learning and Unsupervised Representation Learning, we still name this repo `OpenSelfSup`.
+
+
+### Major features
+
+- **All methods in one repository**
+
+|                                                                                                                                                         | Support  |
+|---------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:|
+| [ImageNet](https://link.springer.com/article/10.1007/s11263-015-0816-y?sa_campaign=email/event/articleAuthor/onlineFirst#)                              | ✓        |
+| [Relative-Loc](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Doersch_Unsupervised_Visual_Representation_ICCV_2015_paper.pdf)        | ✓        |
+| [Rotation-Pred](https://arxiv.org/abs/1803.07728)                                                                                                       | ✓        |
+| [DeepCluster](https://arxiv.org/abs/1807.05520)                                                                                                         | ✓        |
+| [ODC](http://openaccess.thecvf.com/content_CVPR_2020/papers/Zhan_Online_Deep_Clustering_for_Unsupervised_Representation_Learning_CVPR_2020_paper.pdf)   | ✓        |
+| [NPID](https://arxiv.org/abs/1805.01978)                                                                                                                | ✓        |
+| [MoCo](https://arxiv.org/abs/1911.05722)                                                                                                                | ✓        |
+| [MoCo v2](https://arxiv.org/abs/2003.04297)                                                                                                             | ✓        |
+| [SimCLR](https://arxiv.org/abs/2002.05709)                                                                                                              | ✓        |
+| [PIRL](http://openaccess.thecvf.com/content_CVPR_2020/papers/Misra_Self-Supervised_Learning_of_Pretext-Invariant_Representations_CVPR_2020_paper.pdf)   | progress |
+
+- **Flexibility & Extensibility**
+
+OpenSelfSup follows a code architecture similar to MMDetection's, while being even more flexible than MMDetection, since OpenSelfSup integrates various self-supervised tasks, including classification, joint clustering and feature learning, contrastive learning, tasks with a memory bank, etc.
+
+For existing methods in this repo, you only need to modify config files to adjust hyper-parameters, as sketched below. It is also simple to design your own methods; please refer to [GETTING_STARTED](docs/GETTING_STARTED.md).
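+
+To make the config-driven point concrete, here is a minimal, hypothetical sketch in the mmcv-style Python config format used throughout this repo (cf. `benchmarks/extract_info/voc07.py`); the specific values and the `optimizer`/`total_epochs` fields are illustrative assumptions, not copied from a shipped config:
+
+```python
+# hypothetical_config_override.py -- illustrative sketch only
+data = dict(
+    imgs_per_gpu=32,    # batch size per GPU (assumed value)
+    workers_per_gpu=4,  # data-loading workers per GPU (assumed value)
+)
+optimizer = dict(type='SGD', lr=0.03, weight_decay=1e-4, momentum=0.9)
+total_epochs = 200  # train longer by editing a single line
+```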
+
+- **Efficiency**
+
+  All methods support multi-machine multi-gpu distributed training.
+
+- **Standardized Benchmarks**
+
+  We standardize the benchmarks, including logistic regression, SVM / Low-shot SVM from linearly probed features, semi-supervised classification, and object detection. Below are the settings of these benchmarks.
+
+| Benchmarks                       | Setting                                                                                                                                                                      | Difference                                       |
+|----------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------|
+| ImageNet Linear Classification   | [goyal2019scaling](http://openaccess.thecvf.com/content_ICCV_2019/papers/Goyal_Scaling_and_Benchmarking_Self-Supervised_Visual_Representation_Learning_ICCV_2019_paper.pdf) | Total 90 epochs, decay at [30, 60].              |
+| Places205 Linear Classification  | [goyal2019scaling](http://openaccess.thecvf.com/content_ICCV_2019/papers/Goyal_Scaling_and_Benchmarking_Self-Supervised_Visual_Representation_Learning_ICCV_2019_paper.pdf) | Total 90 epochs, decay at [30, 60].              |
+| PASCAL VOC07 SVM                 | [goyal2019scaling](http://openaccess.thecvf.com/content_ICCV_2019/papers/Goyal_Scaling_and_Benchmarking_Self-Supervised_Visual_Representation_Learning_ICCV_2019_paper.pdf) | Costs="1.0,10.0,100.0" to save evaluation time.  |
+| PASCAL VOC07 Low-shot SVM        | [goyal2019scaling](http://openaccess.thecvf.com/content_ICCV_2019/papers/Goyal_Scaling_and_Benchmarking_Self-Supervised_Visual_Representation_Learning_ICCV_2019_paper.pdf) | Costs="1.0,10.0,100.0" to save evaluation time.  |
+| PASCAL VOC07+12 Object Detection | [MoCo](http://openaccess.thecvf.com/content_CVPR_2020/papers/He_Momentum_Contrast_for_Unsupervised_Visual_Representation_Learning_CVPR_2020_paper.pdf)                      |                                                  |
+| COCO17 Object Detection          | [MoCo](http://openaccess.thecvf.com/content_CVPR_2020/papers/He_Momentum_Contrast_for_Unsupervised_Visual_Representation_Learning_CVPR_2020_paper.pdf)                      |                                                  |
+
+
+## License
+
+This project is released under the [Apache 2.0 license](LICENSE).
+
+## Changelog
+
+v0.1.0 was released on 15/06/2020.
+Please refer to [CHANGELOG.md](docs/CHANGELOG.md) for details and release history.
+
+## Benchmark and model zoo
+
+## Installation
+
+Please refer to [INSTALL.md](docs/INSTALL.md) for installation and dataset preparation.
+
+
+## Get Started
+
+Please see [GETTING_STARTED.md](docs/GETTING_STARTED.md) for the basic usage of OpenSelfSup. As a quick example, the SVM benchmark can be launched as sketched below.
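+
+A hedged usage sketch: the argument order (config, epoch, feature list, number of GPUs) follows `benchmarks/dist_test_svm.sh` in this commit, and the config path is a placeholder for one of your own pretraining configs:
+
+```shell
+# evaluate epoch 200 of a (hypothetical) pretraining config on the VOC07 SVM benchmark
+bash benchmarks/dist_test_svm.sh configs/selfsup/your_method.py 200 "feat5" 8
+```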
+
+## Contributing
+
+We appreciate all contributions to improve OpenSelfSup. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guideline.
+
+## Citation
+
+If you use this toolbox or benchmark in your research, please cite this project.
+
+```
+@misc{openselfsup,
+  title = {{OpenSelfSup}: Open MMLab Self-Supervised Learning Toolbox and Benchmark},
+  author = {Xiaohang Zhan and Jiahao Xie and Ziwei Liu and Dahua Lin and Chen Change Loy},
+  howpublished = {\url{https://github.com/open-mmlab/openselfsup}},
+  year = {2020}
+}
+```
+
+## Acknowledgement
+
+1. This repo borrows the architecture design and part of the code from [MMDetection](https://github.com/open-mmlab/mmdetection).
+
+2. The implementation of MoCo and the detection benchmark borrows code from [moco](https://github.com/facebookresearch/moco).
+
+3. The SVM benchmark borrows code from [fair_self_supervision_benchmark](https://github.com/facebookresearch/fair_self_supervision_benchmark).
+
+4. `openselfsup/third_party/clustering.py` is borrowed from [deepcluster](https://github.com/facebookresearch/deepcluster/blob/master/clustering.py).
+
+## Contact
+
+This repo is currently maintained by Xiaohang Zhan ([@XiaohangZhan](http://github.com/XiaohangZhan)).
diff --git a/benchmarks/detection/README.md b/benchmarks/detection/README.md
new file mode 100644
index 00000000..caeb7ae3
--- /dev/null
+++ b/benchmarks/detection/README.md
@@ -0,0 +1,12 @@
+
+## Transferring to Detection
+
+We follow the evaluation setting in MoCo when transferring to object detection.
+
+### Instruction
+
+1. Install [detectron2](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md).
+
+1. Put dataset under "benchmarks/detection/datasets" directory,
+   following the [directory structure](https://github.com/facebookresearch/detectron2/tree/master/datasets)
+   required by detectron2.
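+
+1. Convert the pretrained backbone to a detectron2-compatible checkpoint and start training. A hedged sketch based on `convert-pretrain-to-detectron2.py` and `run.sh` in this directory (the `.pth` path is a placeholder):
+
+```shell
+cd benchmarks/detection
+# convert an OpenSelfSup checkpoint (.pth) into detectron2's .pkl format
+python convert-pretrain-to-detectron2.py /path/to/pretrain.pth pretrain.pkl
+# fine-tune Faster R-CNN on VOC07+12 with the converted weights (run.sh hard-codes --num-gpus 8)
+bash run.sh configs/pascal_voc_R_50_C4_24k_moco.yaml pretrain.pkl
+```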
diff --git a/benchmarks/detection/configs/Base-RCNN-C4-BN.yaml b/benchmarks/detection/configs/Base-RCNN-C4-BN.yaml
new file mode 100644
index 00000000..5104c6a6
--- /dev/null
+++ b/benchmarks/detection/configs/Base-RCNN-C4-BN.yaml
@@ -0,0 +1,17 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  RPN:
+    PRE_NMS_TOPK_TEST: 6000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "Res5ROIHeadsExtraNorm"
+  BACKBONE:
+    FREEZE_AT: 0
+  RESNETS:
+    NORM: "SyncBN"
+TEST:
+  PRECISE_BN:
+    ENABLED: True
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
diff --git a/benchmarks/detection/configs/coco_R_50_C4_2x.yaml b/benchmarks/detection/configs/coco_R_50_C4_2x.yaml
new file mode 100644
index 00000000..5b7e4240
--- /dev/null
+++ b/benchmarks/detection/configs/coco_R_50_C4_2x.yaml
@@ -0,0 +1,13 @@
+_BASE_: "Base-RCNN-C4-BN.yaml"
+MODEL:
+  MASK_ON: True
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+  MIN_SIZE_TEST: 800
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  STEPS: (120000, 160000)
+  MAX_ITER: 180000
diff --git a/benchmarks/detection/configs/coco_R_50_C4_2x_moco.yaml b/benchmarks/detection/configs/coco_R_50_C4_2x_moco.yaml
new file mode 100644
index 00000000..8e310683
--- /dev/null
+++ b/benchmarks/detection/configs/coco_R_50_C4_2x_moco.yaml
@@ -0,0 +1,10 @@
+_BASE_: "coco_R_50_C4_2x.yaml"
+MODEL:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  WEIGHTS: "See Instructions"
+  RESNETS:
+    STRIDE_IN_1X1: False
+INPUT:
+  MAX_SIZE_TRAIN: 1200
+  FORMAT: "RGB"
diff --git a/benchmarks/detection/configs/pascal_voc_R_50_C4_24k.yaml b/benchmarks/detection/configs/pascal_voc_R_50_C4_24k.yaml
new file mode 100644
index 00000000..a05eb5e2
--- /dev/null
+++ b/benchmarks/detection/configs/pascal_voc_R_50_C4_24k.yaml
@@ -0,0 +1,16 @@
+_BASE_: "Base-RCNN-C4-BN.yaml"
+MODEL:
+  MASK_ON: False
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_HEADS:
+    NUM_CLASSES: 20
+INPUT:
+  MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
+  MIN_SIZE_TEST: 800
+DATASETS:
+  TRAIN: ('voc_2007_trainval', 'voc_2012_trainval')
+  TEST: ('voc_2007_test',)
+SOLVER:
+  STEPS: (18000, 22000)
+  MAX_ITER: 24000
+  WARMUP_ITERS: 100
diff --git a/benchmarks/detection/configs/pascal_voc_R_50_C4_24k_moco.yaml b/benchmarks/detection/configs/pascal_voc_R_50_C4_24k_moco.yaml
new file mode 100644
index 00000000..eebe6905
--- /dev/null
+++ b/benchmarks/detection/configs/pascal_voc_R_50_C4_24k_moco.yaml
@@ -0,0 +1,9 @@
+_BASE_: "pascal_voc_R_50_C4_24k.yaml"
+MODEL:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  WEIGHTS: "See Instructions"
+  RESNETS:
+    STRIDE_IN_1X1: False
+INPUT:
+  FORMAT: "RGB"
diff --git a/benchmarks/detection/convert-pretrain-to-detectron2.py b/benchmarks/detection/convert-pretrain-to-detectron2.py
new file mode 100755
index 00000000..e8bf5434
--- /dev/null
+++ b/benchmarks/detection/convert-pretrain-to-detectron2.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import pickle as pkl
+import sys
+import torch
+
+if __name__ == "__main__":
+    input = sys.argv[1]
+
+    obj = torch.load(input, map_location="cpu")
+    obj = obj["state_dict"]
+
+    newmodel = {}
+    for k, v in obj.items():
+        old_k = k
+        if "layer" not in k:
+            k = "stem." + k
+        for t in [1, 2, 3, 4]:
+            k = k.replace("layer{}".format(t), "res{}".format(t + 1))
+        for t in [1, 2, 3]:
+            k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
+        k = k.replace("downsample.0", "shortcut")
+        k = k.replace("downsample.1", "shortcut.norm")
+        print(old_k, "->", k)
+        newmodel[k] = v.numpy()
+
+    res = {
+        "model": newmodel,
+        "__author__": "OpenSelfSup",
+        "matching_heuristics": True
+    }
+
+    assert sys.argv[2].endswith('.pkl')
+    with open(sys.argv[2], "wb") as f:
+        pkl.dump(res, f)
diff --git a/benchmarks/detection/run.sh b/benchmarks/detection/run.sh
new file mode 100644
index 00000000..2b35e59d
--- /dev/null
+++ b/benchmarks/detection/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+DET_CFG=$1
+WEIGHTS=$2
+
+python $(dirname "$0")/train_net.py --config-file $DET_CFG \
+    --num-gpus 8 MODEL.WEIGHTS $WEIGHTS
diff --git a/benchmarks/detection/train_net.py b/benchmarks/detection/train_net.py
new file mode 100755
index 00000000..8ae31c9e
--- /dev/null
+++ b/benchmarks/detection/train_net.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import os
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
+from detectron2.evaluation import COCOEvaluator, PascalVOCDetectionEvaluator
+from detectron2.layers import get_norm
+from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads
+
+
+@ROI_HEADS_REGISTRY.register()
+class Res5ROIHeadsExtraNorm(Res5ROIHeads):
+    """
+    As described in the MoCo paper, there is an extra BN layer
+    following the res5 stage.
+    """
+
+    def _build_res5_block(self, cfg):
+        seq, out_channels = super()._build_res5_block(cfg)
+        norm = cfg.MODEL.RESNETS.NORM
+        norm = get_norm(norm, out_channels)
+        seq.add_module("norm", norm)
+        return seq, out_channels
+
+
+class Trainer(DefaultTrainer):
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        if "coco" in dataset_name:
+            return COCOEvaluator(dataset_name, cfg, True, output_folder)
+        else:
+            assert "voc" in dataset_name
+            return PascalVOCDetectionEvaluator(dataset_name)
+
+
+def setup(args):
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(
+            model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+                cfg.MODEL.WEIGHTS, resume=args.resume)
+        res = Trainer.test(cfg, model)
+        return res
+
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args, ),
+    )
diff --git a/benchmarks/dist_test_cls.sh b/benchmarks/dist_test_cls.sh
new file mode 100755
index 00000000..dd0a5ab2
--- /dev/null
+++ b/benchmarks/dist_test_cls.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+CFG=$1
+EPOCH=$2
+DATASET=$3  # imagenet or places205
+GPUS=${GPUS:-1}
+PORT=${PORT:-29500}
+PY_ARGS=${@:4}
+
+WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/
+CHECKPOINT=$WORK_DIR/epoch_${EPOCH}.pth
+WORK_DIR_EVAL=$WORK_DIR/${DATASET}_at_epoch_${EPOCH}/ + +# extract backbone +if [ ! -f "${CHECKPOINT::(-4)}_extracted.pth" ]; then + python tools/extract_backbone_weights.py $CHECKPOINT \ + --save-path ${CHECKPOINT::(-4)}_extracted.pth +fi + +# train +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + tools/train.py \ + configs/linear_classifier/${DATASET}/r50_multihead.py \ + --pretrained ${CHECKPOINT::(-4)}_extracted.pth \ + --work_dir ${WORK_DIR_EVAL} --seed 0 --launcher="pytorch" ${PY_ARGS} + +# test +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + tools/test.py \ + configs/linear_classifier/${DATASET}/r50_multihead.py \ + ${WORK_DIR_EVAL}/latest.pth \ + --work_dir ${WORK_DIR_EVAL} --launcher="pytorch" diff --git a/benchmarks/dist_test_svm.sh b/benchmarks/dist_test_svm.sh new file mode 100644 index 00000000..82fa67aa --- /dev/null +++ b/benchmarks/dist_test_svm.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -e +set -x + +CFG=$1 +EPOCH=$2 +FEAT_LIST=$3 +GPUS=$4 +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ + +bash tools/dist_extract.sh $CFG $WORK_DIR/epoch_${EPOCH}.pth $GPUS + +bash benchmarks/eval_svm.sh $WORK_DIR $FEAT_LIST + +bash benchmarks/eval_svm_lowshot.sh $WORK_DIR $FEAT_LIST diff --git a/benchmarks/eval_svm.sh b/benchmarks/eval_svm.sh new file mode 100644 index 00000000..8e8ca478 --- /dev/null +++ b/benchmarks/eval_svm.sh @@ -0,0 +1,38 @@ +#!/bin/bash +WORK_DIR=$1 +FEAT_LIST=${2:-"feat5"} # "feat1 feat2 feat3 feat4 feat5" +TRAIN_SVM_FLAG=true +TEST_SVM_FLAG=true +DATA="data/VOCdevkit/VOC2007/SVMLabels" + +# config svm +costs="1.0,10.0,100.0" + +mkdir $WORK_DIR/logs +for feat in $FEAT_LIST; do + echo "For feature: $feat" 2>&1 | tee -a $WORK_DIR/logs/eval_svm.log + # train svm + if $TRAIN_SVM_FLAG; then + rm -rf $WORK_DIR/svm + mkdir -p $WORK_DIR/svm/voc07_${feat} + echo "training svm ..." + python benchmarks/svm_tools/train_svm_kfold_parallel.py \ + --data_file $WORK_DIR/features/voc07_trainval_${feat}.npy \ + --targets_data_file $DATA/train_labels.npy \ + --costs_list $costs \ + --output_path $WORK_DIR/svm/voc07_${feat} + fi + + # test svm + if $TEST_SVM_FLAG; then + echo "testing svm ..." + python benchmarks/svm_tools/test_svm.py \ + --data_file $WORK_DIR/features/voc07_test_${feat}.npy \ + --json_targets $DATA/test_targets.json \ + --targets_data_file $DATA/test_labels.npy \ + --costs_list $costs \ + --generate_json 1 \ + --output_path $WORK_DIR/svm/voc07_${feat} 2>&1 | tee -a $WORK_DIR/logs/eval_svm.log + fi + +done diff --git a/benchmarks/eval_svm_lowshot.sh b/benchmarks/eval_svm_lowshot.sh new file mode 100644 index 00000000..8f16be4d --- /dev/null +++ b/benchmarks/eval_svm_lowshot.sh @@ -0,0 +1,62 @@ +#!/bin/bash +WORK_DIR=$1 +MODE="full" +FEAT_LIST=${2:-"feat5"} # "feat1 feat2 feat3 feat4 feat5" +TRAIN_SVM_LOWSHOT_FLAG=true +TEST_SVM_LOWSHOT_FLAG=true +AGGREGATE_FLAG=true +DATA="data/VOCdevkit/VOC2007/SVMLabels" + +# config svm +costs="1.0,10.0,100.0" +if [ "$MODE" == "fast" ]; then + shots="96" +else + shots="1 2 4 8 16 32 64 96" +fi + +mkdir $WORK_DIR/logs +for feat in $FEAT_LIST; do + echo "For feature: $feat" 2>&1 | tee -a $WORK_DIR/logs/eval_svm_lowshot.log + # train lowshot svm + if $TRAIN_SVM_LOWSHOT_FLAG; then + rm -rf $WORK_DIR/svm_lowshot + mkdir -p $WORK_DIR/svm_lowshot/voc07_${feat} + echo "training svm low-shot ..." 
+ for s in {1..5}; do + for k in $shots; do + echo -e "\ts${s} k${k}" + python benchmarks/svm_tools/train_svm_low_shot.py \ + --data_file $WORK_DIR/features/voc07_trainval_${feat}.npy \ + --targets_data_file $DATA/low_shot/labels/train_targets_sample${s}_k${k}.npy \ + --costs_list $costs \ + --output_path $WORK_DIR/svm_lowshot/voc07_${feat} + done + done + fi + + # test lowshot svm + if $TEST_SVM_LOWSHOT_FLAG; then + echo "testing svm low-shot ..." + python benchmarks/svm_tools/test_svm_low_shot.py \ + --data_file $WORK_DIR/features/voc07_test_${feat}.npy \ + --targets_data_file $DATA/test_labels.npy \ + --json_targets $DATA/test_targets.json \ + --generate_json 1 \ + --costs_list $costs \ + --output_path $WORK_DIR/svm_lowshot/voc07_${feat} \ + --k_values "${shots// /,}" \ + --sample_inds "0,1,2,3,4" \ + --dataset "voc" + fi + + # aggregate testing results + if $AGGREGATE_FLAG; then + echo "aggregating svm low-shot ..." + python benchmarks/svm_tools/aggregate_low_shot_svm_stats.py \ + --output_path $WORK_DIR/svm_lowshot/voc07_${feat} \ + --k_values "${shots// /,}" \ + --sample_inds "0,1,2,3,4" 2>&1 | tee -a $WORK_DIR/logs/eval_svm_lowshot.log + fi + +done diff --git a/benchmarks/extract_info/voc07.py b/benchmarks/extract_info/voc07.py new file mode 100644 index 00000000..2680b198 --- /dev/null +++ b/benchmarks/extract_info/voc07.py @@ -0,0 +1,20 @@ +data_source_cfg = dict(type='ImageList', memcached=False, mclient_path=None) +data_root = "data/VOCdevkit/VOC2007/JPEGImages" +data_all_list = "data/VOCdevkit/VOC2007/Lists/trainvaltest.txt" +split_at = [5011] +split_name = ['voc07_trainval', 'voc07_test'] +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + +data = dict( + imgs_per_gpu=32, + workers_per_gpu=2, + extract=dict( + type="ExtractDataset", + data_source=dict( + list_file=data_all_list, root=data_root, **data_source_cfg), + pipeline=[ + dict(type='Resize', size=256), + dict(type='Resize', size=(224, 224)), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), + ])) diff --git a/benchmarks/srun_test_cls.sh b/benchmarks/srun_test_cls.sh new file mode 100644 index 00000000..472a97a0 --- /dev/null +++ b/benchmarks/srun_test_cls.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash + +set -e +set -x + +PARTITION=$1 +CFG=$2 +EPOCH=$3 +DATASET=$4 # imagenet or places205 +PY_ARGS=${@:5} +JOB_NAME="openselfsup" +GPUS=${GPUS:-1} +GPUS_PER_NODE=${GPUS_PER_NODE:-1} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +SRUN_ARGS=${SRUN_ARGS:-""} + +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ +CHECKPOINT=$WORK_DIR/epoch_${EPOCH}.pth +WORK_DIR_EVAL=$WORK_DIR/${DATASET}_at_epoch_${EPOCH}/ + +# extract backbone +if [ ! 
-f "${CHECKPOINT::(-4)}_extracted.pth" ]; then + srun -p ${PARTITION} \ + python tools/extract_backbone_weights.py $CHECKPOINT \ + --save-path ${CHECKPOINT::(-4)}_extracted.pth +fi + +# train +GLOG_vmodule=MemcachedClient=-1 \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/train.py \ + configs/linear_classifier/${DATASET}/r50_multihead.py \ + --pretrained ${CHECKPOINT::(-4)}_extracted.pth \ + --work_dir ${WORK_DIR_EVAL} --seed 0 --launcher="slurm" ${PY_ARGS} + +# test +GLOG_vmodule=MemcachedClient=-1 \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/test.py \ + configs/linear_classifier/${DATASET}/r50_multihead.py \ + ${WORK_DIR_EVAL}/latest.pth \ + --work_dir ${WORK_DIR_EVAL} --launcher="slurm" diff --git a/benchmarks/srun_test_semi.sh b/benchmarks/srun_test_semi.sh new file mode 100644 index 00000000..713955ab --- /dev/null +++ b/benchmarks/srun_test_semi.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +set -e +set -x + +PARTITION=$1 +CFG=$2 +EPOCH=$3 +PERCENT=$4 +PY_ARGS=${@:5} +JOB_NAME="openselfsup" +GPUS=${GPUS:-1} +GPUS_PER_NODE=${GPUS_PER_NODE:-1} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +SRUN_ARGS=${SRUN_ARGS:-""} + +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ +CHECKPOINT=$WORK_DIR/epoch_${EPOCH}.pth +WORK_DIR_EVAL=$WORK_DIR/imagenet_semi_${PERCENT}percent_at_epoch_${EPOCH}/ + +if [ ! "$PERCENT" == "1" ] && [ ! "$PERCENT" == 10 ]; then + echo "ERROR: PERCENT must in {1, 10}" + exit +fi +# extract backbone +if [ ! 
-f "${CHECKPOINT::(-4)}_extracted.pth" ]; then + srun -p ${PARTITION} \ + python tools/extract_backbone_weights.py $CHECKPOINT \ + --save-path ${CHECKPOINT::(-4)}_extracted.pth +fi + +# train +GLOG_vmodule=MemcachedClient=-1 \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/train.py \ + configs/semisup_classification/imagenet_${PERCENT}percent/r50.py \ + --pretrained ${CHECKPOINT::(-4)}_extracted.pth \ + --work_dir ${WORK_DIR_EVAL} --seed 0 --launcher="slurm" ${PY_ARGS} + +# test +GLOG_vmodule=MemcachedClient=-1 \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/test.py \ + configs/semisup_classification/imagenet_${PERCENT}percent/r50.py \ + ${WORK_DIR_EVAL}/latest.pth \ + --work_dir ${WORK_DIR_EVAL} --launcher="slurm" diff --git a/benchmarks/srun_test_svm.sh b/benchmarks/srun_test_svm.sh new file mode 100644 index 00000000..a6e817bf --- /dev/null +++ b/benchmarks/srun_test_svm.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e +set -x + +PARTITION=$1 +CFG=$2 +EPOCH=$3 +FEAT=$4 +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ + +bash tools/srun_extract.sh $PARTITION $CFG $WORK_DIR/epoch_${EPOCH}.pth + +srun -p $PARTITION bash benchmarks/eval_svm.sh $WORK_DIR $FEAT + +srun -p $PARTITION bash benchmarks/eval_svm.sh $WORK_DIR $FEAT diff --git a/benchmarks/svm_tools/aggregate_low_shot_svm_stats.py b/benchmarks/svm_tools/aggregate_low_shot_svm_stats.py new file mode 100644 index 00000000..b797e0c2 --- /dev/null +++ b/benchmarks/svm_tools/aggregate_low_shot_svm_stats.py @@ -0,0 +1,128 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +################################################################################ +""" +Aggregate the stats over various independent samples for low-shot svm training. +Stats computed: mean, max, min, std + +Relevant transfer tasks: Low-shot Image Classification VOC07 and Places205 low +shot samples. 
+""" + +from __future__ import division +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import argparse +import logging +import numpy as np +import os +import sys + +# create the logger +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def save_stats(output_dir, stat, output): + out_file = os.path.join(output_dir, 'test_ap_{}.npy'.format(stat)) + logger.info('Saving {} to: {} {}'.format(stat, out_file, output.shape)) + np.save(out_file, output) + + +def aggregate_stats(opts): + k_values = [int(val) for val in opts.k_values.split(",")] + sample_inds = [int(val) for val in opts.sample_inds.split(",")] + logger.info( + 'Aggregating stats for k-values: {} and sample_inds: {}'.format( + k_values, sample_inds)) + + output_mean, output_max, output_min, output_std = [], [], [], [] + for k_idx in range(len(k_values)): + k_low = k_values[k_idx] + k_val_output = [] + for inds in range(len(sample_inds)): + sample_idx = sample_inds[inds] + file_name = 'test_ap_sample{}_k{}.npy'.format( + sample_idx + 1, k_low) + filepath = os.path.join(opts.output_path, file_name) + if os.path.exists(filepath): + k_val_output.append(np.load(filepath, encoding='latin1')) + else: + logger.info('file does not exist: {}'.format(filepath)) + # import pdb; pdb.set_trace() + k_val_output = np.concatenate(k_val_output, axis=0) + k_low_max = np.max( + k_val_output, axis=0).reshape(-1, k_val_output.shape[1]) + k_low_min = np.min( + k_val_output, axis=0).reshape(-1, k_val_output.shape[1]) + k_low_mean = np.mean( + k_val_output, axis=0).reshape(-1, k_val_output.shape[1]) + k_low_std = np.std( + k_val_output, axis=0).reshape(-1, k_val_output.shape[1]) + output_mean.append(k_low_mean) + output_min.append(k_low_min) + output_max.append(k_low_max) + output_std.append(k_low_std) + + output_mean = np.concatenate(output_mean, axis=0) + output_min = np.concatenate(output_min, axis=0) + output_max = np.concatenate(output_max, axis=0) + output_std = np.concatenate(output_std, axis=0) + + save_stats(opts.output_path, 'mean', output_mean) + save_stats(opts.output_path, 'min', output_min) + save_stats(opts.output_path, 'max', output_max) + save_stats(opts.output_path, 'std', output_std) + + argmax_cls = np.argmax(output_mean, axis=1) + argmax_mean, argmax_min, argmax_max, argmax_std = [], [], [], [] + for idx in range(len(argmax_cls)): + argmax_mean.append(100.0 * output_mean[idx, argmax_cls[idx]]) + argmax_min.append(100.0 * output_min[idx, argmax_cls[idx]]) + argmax_max.append(100.0 * output_max[idx, argmax_cls[idx]]) + argmax_std.append(100.0 * output_std[idx, argmax_cls[idx]]) + for idx in range(len(argmax_max)): + logger.info('mean/min/max/std: {} / {} / {} / {}'.format( + round(argmax_mean[idx], 2), + round(argmax_min[idx], 2), + round(argmax_max[idx], 2), + round(argmax_std[idx], 2), + )) + logger.info('All done!!') + + +def main(): + parser = argparse.ArgumentParser(description='Low shot SVM model test') + parser.add_argument( + '--output_path', + type=str, + default=None, + help="Numpy file containing test AP result files") + parser.add_argument( + '--k_values', + type=str, + default=None, + help="Low-shot k-values for svm testing. Comma separated") + parser.add_argument( + '--sample_inds', + type=str, + default=None, + help="sample_inds for which to test svm. 
Comma separated") + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + opts = parser.parse_args() + logger.info(opts) + aggregate_stats(opts) + + +if __name__ == '__main__': + main() diff --git a/benchmarks/svm_tools/svm_helper.py b/benchmarks/svm_tools/svm_helper.py new file mode 100644 index 00000000..1792d5ea --- /dev/null +++ b/benchmarks/svm_tools/svm_helper.py @@ -0,0 +1,171 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +################################################################################ +""" +Helper module for svm training and testing. +""" + +from __future__ import division +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import logging +import numpy as np +import os +import sys + +# create the logger +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +# Python 2 and python 3 have different floating point precision. The following +# trick helps keep the backwards compatibility. +def py2_py3_compatible_cost(cost): + return str(float("{:.17f}".format(cost))) + + +def get_svm_train_output_files(cls, cost, output_path): + cls_cost = str(cls) + '_cost' + py2_py3_compatible_cost(cost) + out_file = os.path.join(output_path, 'cls' + cls_cost + '.pickle') + ap_matrix_out_file = os.path.join(output_path, + 'AP_cls' + cls_cost + '.npy') + return out_file, ap_matrix_out_file + + +def parse_cost_list(costs): + costs_list = [float(cost) for cost in costs.split(",")] + start_num, end_num = 4, 20 + for num in range(start_num, end_num): + costs_list.append(0.5**num) + return costs_list + + +def normalize_features(features): + feats_norm = np.linalg.norm(features, axis=1) + features = features / (feats_norm + 1e-5)[:, np.newaxis] + return features + + +def load_input_data(data_file, targets_file): + # load the features and the targets + #logger.info('loading features and targets...') + targets = np.load(targets_file, encoding='latin1') + features = np.array(np.load(data_file, + encoding='latin1')).astype(np.float64) + assert features.shape[0] == targets.shape[0], "Mismatched #images" + logger.info('Loaded features: {} and targets: {}'.format( + features.shape, targets.shape)) + return features, targets + + +def calculate_ap(rec, prec): + """ + Computes the AP under the precision recall curve. 
+ """ + rec, prec = rec.reshape(rec.size, 1), prec.reshape(prec.size, 1) + z, o = np.zeros((1, 1)), np.ones((1, 1)) + mrec, mpre = np.vstack((z, rec, o)), np.vstack((z, prec, z)) + for i in range(len(mpre) - 2, -1, -1): + mpre[i] = max(mpre[i], mpre[i + 1]) + + indices = np.where(mrec[1:] != mrec[0:-1])[0] + 1 + ap = 0 + for i in indices: + ap = ap + (mrec[i] - mrec[i - 1]) * mpre[i] + return ap + + +def get_precision_recall(targets, preds): + """ + [P, R, score, ap] = get_precision_recall(targets, preds) + Input : + targets : number of occurrences of this class in the ith image + preds : score for this image + Output : + P, R : precision and recall + score : score which corresponds to the particular precision and recall + ap : average precision + """ + # binarize targets + targets = np.array(targets > 0, dtype=np.float32) + tog = np.hstack((targets[:, np.newaxis].astype(np.float64), + preds[:, np.newaxis].astype(np.float64))) + ind = np.argsort(preds) + ind = ind[::-1] + score = np.array([tog[i, 1] for i in ind]) + sortcounts = np.array([tog[i, 0] for i in ind]) + + tp = sortcounts + fp = sortcounts.copy() + for i in range(sortcounts.shape[0]): + if sortcounts[i] >= 1: + fp[i] = 0. + elif sortcounts[i] < 1: + fp[i] = 1. + P = np.cumsum(tp) / (np.cumsum(tp) + np.cumsum(fp)) + numinst = np.sum(targets) + R = np.cumsum(tp) / numinst + ap = calculate_ap(R, P) + return P, R, score, ap + + +def get_low_shot_output_file(opts, cls, cost, suffix): + # in case of low-shot training, we train for 5 independent samples + # (sample{}) and vary low-shot amount (k{}). The input data should have + # sample{}_k{} information that we extract in suffix below. + # logger.info('Suffix: {}'.format(suffix)) + cls_cost = str(cls) + '_cost' + py2_py3_compatible_cost(cost) + out_file = os.path.join(opts.output_path, + 'cls' + cls_cost + '_' + suffix + '.pickle') + return out_file + + +def get_low_shot_svm_classes(targets, dataset): + # classes for which SVM testing should be done + num_classes, cls_list = None, None + if dataset == 'voc': + num_classes = targets.shape[1] + cls_list = range(num_classes) + elif dataset == 'places': + # each image in places has a target cls [0, .... ,204] + num_classes = len(set(targets[:, 0].tolist())) + cls_list = list(set(targets[:, 0].tolist())) + else: + logger.info('Dataset not recognized. Abort!') + logger.info('Testing SVM for classes: {}'.format(cls_list)) + logger.info('Num classes: {}'.format(num_classes)) + return num_classes, cls_list + + +def get_cls_feats_labels(cls, features, targets, dataset): + out_feats, out_cls_labels = None, None + if dataset == 'voc': + cls_labels = targets[:, cls].astype(dtype=np.int32, copy=True) + # find the indices for positive/negative imgs. Remove the ignore label. + out_data_inds = (targets[:, cls] != -1) + out_feats = features[out_data_inds] + out_cls_labels = cls_labels[out_data_inds] + # label 0 = not present, set it to -1 as svm train target. + # Make the svm train target labels as -1, 1. + out_cls_labels[np.where(out_cls_labels == 0)] = -1 + elif dataset == 'places': + out_feats = features + out_cls_labels = targets.astype(dtype=np.int32, copy=True) + # for the given class, get the relevant positive/negative images and + # make the label 1, -1 + cls_inds = np.where(targets[:, 0] == cls) + non_cls_inds = (targets[:, 0] != cls) + out_cls_labels[non_cls_inds] = -1 + out_cls_labels[cls_inds] = 1 + # finally reshape into the format taken by sklearn svm package. 
+ out_cls_labels = out_cls_labels.reshape(-1) + else: + raise Exception('args.dataset not recognized') + return out_feats, out_cls_labels diff --git a/benchmarks/svm_tools/test_svm.py b/benchmarks/svm_tools/test_svm.py new file mode 100644 index 00000000..854ec175 --- /dev/null +++ b/benchmarks/svm_tools/test_svm.py @@ -0,0 +1,174 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +################################################################################ +""" +SVM test for image classification. + +Relevant transfer tasks: Image Classification VOC07 and COCO2014. +""" +from __future__ import division +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import argparse +import json +import logging +import numpy as np +import os +import pickle +import six +import sys + +import svm_helper + +# create the logger +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def get_chosen_costs(opts, num_classes): + costs_list = svm_helper.parse_cost_list(opts.costs_list) + train_ap_matrix = np.zeros((num_classes, len(costs_list))) + for cls in range(num_classes): + for cost_idx in range(len(costs_list)): + cost = costs_list[cost_idx] + _, ap_out_file = svm_helper.get_svm_train_output_files( + cls, cost, opts.output_path) + train_ap_matrix[cls][cost_idx] = float( + np.load(ap_out_file, encoding='latin1')[0]) + argmax_cls = np.argmax(train_ap_matrix, axis=1) + chosen_cost = [costs_list[idx] for idx in argmax_cls] + logger.info('chosen_cost: {}'.format(chosen_cost)) + np.save( + os.path.join(opts.output_path, 'crossval_ap.npy'), + np.array(train_ap_matrix)) + np.save( + os.path.join(opts.output_path, 'chosen_cost.npy'), + np.array(chosen_cost)) + logger.info('saved crossval_ap AP to file: {}'.format( + os.path.join(opts.output_path, 'crossval_ap.npy'))) + logger.info('saved chosen costs to file: {}'.format( + os.path.join(opts.output_path, 'chosen_cost.npy'))) + return np.array(chosen_cost) + + +def load_json(file_path): + assert os.path.exists(file_path), "{} does not exist".format(file_path) + with open(file_path, 'r') as fp: + data = json.load(fp) + img_ids = list(data.keys()) + cls_names = list(data[img_ids[0]].keys()) + return img_ids, cls_names + + +def test_svm(opts): + assert os.path.exists(opts.data_file), "Data file not found. Abort!" 
+ json_predictions, img_ids, cls_names = {}, [], [] + if opts.generate_json: + img_ids, cls_names = load_json(opts.json_targets) + + features, targets = svm_helper.load_input_data(opts.data_file, + opts.targets_data_file) + # normalize the features: N x 9216 (example shape) + features = svm_helper.normalize_features(features) + num_classes = targets.shape[1] + logger.info('Num classes: {}'.format(num_classes)) + + # get the chosen cost that maximizes the cross-validation AP per class + costs_list = get_chosen_costs(opts, num_classes) + + ap_matrix = np.zeros((num_classes, 1)) + for cls in range(num_classes): + cost = costs_list[cls] + logger.info('Testing model for cls: {} cost: {}'.format(cls, cost)) + model_file = os.path.join( + opts.output_path, + 'cls' + str(cls) + '_cost' + str(cost) + '.pickle') + with open(model_file, 'rb') as fopen: + if six.PY2: + model = pickle.load(fopen) + else: + model = pickle.load(fopen, encoding='latin1') + prediction = model.decision_function(features) + if opts.generate_json: + cls_name = cls_names[cls] + for idx in range(len(prediction)): + img_id = img_ids[idx] + if img_id in json_predictions: + json_predictions[img_id][cls_name] = prediction[idx] + else: + out_lbl = {} + out_lbl[cls_name] = prediction[idx] + json_predictions[img_id] = out_lbl + + cls_labels = targets[:, cls] + # meaning of labels in VOC/COCO original loaded target files: + # label 0 = not present, set it to -1 as svm train target + # label 1 = present. Make the svm train target labels as -1, 1. + evaluate_data_inds = (targets[:, cls] != -1) + eval_preds = prediction[evaluate_data_inds] + eval_cls_labels = cls_labels[evaluate_data_inds] + eval_cls_labels[np.where(eval_cls_labels == 0)] = -1 + P, R, score, ap = svm_helper.get_precision_recall( + eval_cls_labels, eval_preds) + ap_matrix[cls][0] = ap + if opts.generate_json: + output_file = os.path.join(opts.output_path, 'json_preds.json') + with open(output_file, 'w') as fp: + json.dump(json_predictions, fp) + logger.info('Saved json predictions to: {}'.format(output_file)) + logger.info('Mean AP: {}'.format(np.mean(ap_matrix, axis=0))) + np.save(os.path.join(opts.output_path, 'test_ap.npy'), np.array(ap_matrix)) + logger.info('saved test AP to file: {}'.format( + os.path.join(opts.output_path, 'test_ap.npy'))) + + +def main(): + parser = argparse.ArgumentParser(description='SVM model test') + parser.add_argument( + '--data_file', + type=str, + default=None, + help="Numpy file containing image features and labels") + parser.add_argument( + '--json_targets', + type=str, + default=None, + help="Json file containing json targets") + parser.add_argument( + '--targets_data_file', + type=str, + default=None, + help="Numpy file containing image labels") + parser.add_argument( + '--costs_list', + type=str, + default="0.01,0.1", + help="comma separated string containing list of costs") + parser.add_argument( + '--output_path', + type=str, + default=None, + help="path where trained SVM models are saved") + parser.add_argument( + '--generate_json', + type=int, + default=0, + help="Whether to generate json files for output") + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + opts = parser.parse_args() + logger.info(opts) + test_svm(opts) + + +if __name__ == '__main__': + main() diff --git a/benchmarks/svm_tools/test_svm_low_shot.py b/benchmarks/svm_tools/test_svm_low_shot.py new file mode 100644 index 00000000..69475906 --- /dev/null +++ b/benchmarks/svm_tools/test_svm_low_shot.py @@ -0,0 +1,212 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +################################################################################ +""" +SVM test for low shot image classification. + +Relevant transfer tasks: Low-shot Image Classification VOC07 and Places205 low +shot samples. +""" +from __future__ import division +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import argparse +import json +import logging +import numpy as np +import os +import pickle +import six +import sys + +import svm_helper + +# create the logger +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def load_json(file_path): + assert os.path.exists(file_path), "{} does not exist".format(file_path) + with open(file_path, 'r') as fp: + data = json.load(fp) + img_ids = list(data.keys()) + cls_names = list(data[img_ids[0]].keys()) + return img_ids, cls_names + + +def save_json_predictions(opts, cost, sample_idx, k_low, features, cls_list, + cls_names, img_ids): + num_classes = len(cls_list) + json_predictions = {} + for cls in range(num_classes): + suffix = 'sample{}_k{}'.format(sample_idx + 1, k_low) + model_file = svm_helper.get_low_shot_output_file( + opts, cls, cost, suffix) + with open(model_file, 'rb') as fopen: + if six.PY2: + model = pickle.load(fopen) + else: + model = pickle.load(fopen, encoding='latin1') + prediction = model.decision_function(features) + cls_name = cls_names[cls] + for idx in range(len(prediction)): + img_id = img_ids[idx] + if img_id in json_predictions: + json_predictions[img_id][cls_name] = prediction[idx] + else: + out_lbl = {} + out_lbl[cls_name] = prediction[idx] + json_predictions[img_id] = out_lbl + + output_file = os.path.join(opts.output_path, + 'test_{}_json_preds.json'.format(suffix)) + with open(output_file, 'w') as fp: + json.dump(json_predictions, fp) + logger.info('Saved json predictions to: {}'.format(output_file)) + + +def test_svm_low_shot(opts): + k_values = [int(val) for val in opts.k_values.split(",")] + sample_inds = [int(val) for val in opts.sample_inds.split(",")] + logger.info('Testing svm for k-values: {} and sample_inds: {}'.format( + k_values, sample_inds)) + + img_ids, cls_names = [], [] + if opts.generate_json: + img_ids, cls_names = load_json(opts.json_targets) + + assert os.path.exists(opts.data_file), "Data file not found. Abort!" + # we test the svms on the full test set. Given the test features and the + # targets, we test it for various k-values (low-shot), cost values and + # 5 independent samples. + features, targets = svm_helper.load_input_data(opts.data_file, + opts.targets_data_file) + # normalize the features: N x 9216 (example shape) + features = svm_helper.normalize_features(features) + + # parse the cost values for training the SVM on + costs_list = svm_helper.parse_cost_list(opts.costs_list) + logger.info('Testing SVM for costs: {}'.format(costs_list)) + + # classes for which SVM testing should be done + num_classes, cls_list = svm_helper.get_low_shot_svm_classes( + targets, opts.dataset) + + # create the output for per sample, per k-value and per cost. 
+ sample_ap_matrices = [] + for _ in range(len(sample_inds)): + ap_matrix = np.zeros((len(k_values), len(costs_list))) + sample_ap_matrices.append(ap_matrix) + + # the test goes like this: For a given sample, for a given k-value and a + # given cost value, we evaluate the trained svm model for all classes. + # After computing over all classes, we get the mean AP value over all + # classes. We hence end up with: output = [sample][k_value][cost] + for inds in range(len(sample_inds)): + sample_idx = sample_inds[inds] + for k_idx in range(len(k_values)): + k_low = k_values[k_idx] + suffix = 'sample{}_k{}'.format(sample_idx + 1, k_low) + for cost_idx in range(len(costs_list)): + cost = costs_list[cost_idx] + local_cost_ap = np.zeros((num_classes, 1)) + for cls in cls_list: + logger.info( + 'Test sample/k_value/cost/cls: {}/{}/{}/{}'.format( + sample_idx + 1, k_low, cost, cls)) + model_file = svm_helper.get_low_shot_output_file( + opts, cls, cost, suffix) + with open(model_file, 'rb') as fopen: + if six.PY2: + model = pickle.load(fopen) + else: + model = pickle.load(fopen, encoding='latin1') + prediction = model.decision_function(features) + eval_preds, eval_cls_labels = svm_helper.get_cls_feats_labels( + cls, prediction, targets, opts.dataset) + P, R, score, ap = svm_helper.get_precision_recall( + eval_cls_labels, eval_preds) + local_cost_ap[cls][0] = ap + mean_cost_ap = np.mean(local_cost_ap, axis=0) + sample_ap_matrices[inds][k_idx][cost_idx] = mean_cost_ap + out_k_sample_file = os.path.join( + opts.output_path, + 'test_ap_sample{}_k{}.npy'.format(sample_idx + 1, k_low)) + save_data = sample_ap_matrices[inds][k_idx] + save_data = save_data.reshape((1, -1)) + np.save(out_k_sample_file, save_data) + logger.info('Saved sample test k_idx AP to file: {} {}'.format( + out_k_sample_file, save_data.shape)) + if opts.generate_json: + argmax_cls = np.argmax(save_data, axis=1) + chosen_cost = costs_list[argmax_cls[0]] + logger.info('chosen cost: {}'.format(chosen_cost)) + save_json_predictions(opts, chosen_cost, sample_idx, k_low, + features, cls_list, cls_names, img_ids) + logger.info('All done!!') + + +def main(): + parser = argparse.ArgumentParser(description='Low shot SVM model test') + parser.add_argument( + '--data_file', + type=str, + default=None, + help="Numpy file containing image features and labels") + parser.add_argument( + '--targets_data_file', + type=str, + default=None, + help="Numpy file containing image labels") + parser.add_argument( + '--json_targets', + type=str, + default=None, + help="Numpy file containing json targets") + parser.add_argument( + '--generate_json', + type=int, + default=0, + help="Whether to generate json files for output") + parser.add_argument( + '--costs_list', + type=str, + default= + "0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,1.0,10.0,100.0", + help="comma separated string containing list of costs") + parser.add_argument( + '--output_path', + type=str, + default=None, + help="path where trained SVM models are saved") + parser.add_argument( + '--k_values', + type=str, + default="1,2,4,8,16,32,64,96", + help="Low-shot k-values for svm testing. Comma separated") + parser.add_argument( + '--sample_inds', + type=str, + default="0,1,2,3,4", + help="sample_inds for which to test svm. 
Comma separated") + parser.add_argument( + '--dataset', type=str, default="voc", help='voc | places') + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + opts = parser.parse_args() + logger.info(opts) + test_svm_low_shot(opts) + + +if __name__ == '__main__': + main() diff --git a/benchmarks/svm_tools/train_svm_kfold.py b/benchmarks/svm_tools/train_svm_kfold.py new file mode 100644 index 00000000..b3a7f1d2 --- /dev/null +++ b/benchmarks/svm_tools/train_svm_kfold.py @@ -0,0 +1,162 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +################################################################################ +""" +SVM training using 3-fold cross-validation. + +Relevant transfer tasks: Image Classification VOC07 and COCO2014. +""" + +from __future__ import division +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import argparse +import logging +import numpy as np +import os +import pickle +import sys +from tqdm import tqdm +from sklearn.svm import LinearSVC +from sklearn.model_selection import cross_val_score + +import svm_helper + +import time + +# create the logger +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def train_svm(opts): + assert os.path.exists(opts.data_file), "Data file not found. Abort!" + if not os.path.exists(opts.output_path): + os.makedirs(opts.output_path) + + features, targets = svm_helper.load_input_data(opts.data_file, + opts.targets_data_file) + # normalize the features: N x 9216 (example shape) + features = svm_helper.normalize_features(features) + + # parse the cost values for training the SVM on + costs_list = svm_helper.parse_cost_list(opts.costs_list) + #logger.info('Training SVM for costs: {}'.format(costs_list)) + + # classes for which SVM training should be done + if opts.cls_list: + cls_list = [int(cls) for cls in opts.cls_list.split(",")] + else: + num_classes = targets.shape[1] + cls_list = range(num_classes) + #logger.info('Training SVM for classes: {}'.format(cls_list)) + + for cls_idx in tqdm(range(len(cls_list))): + cls = cls_list[cls_idx] + for cost_idx in range(len(costs_list)): + start = time.time() + cost = costs_list[cost_idx] + out_file, ap_out_file = svm_helper.get_svm_train_output_files( + cls, cost, opts.output_path) + if os.path.exists(out_file) and os.path.exists(ap_out_file): + logger.info('SVM model exists: {}'.format(out_file)) + logger.info('AP file exists: {}'.format(ap_out_file)) + else: + #logger.info('Training model with the cost: {}'.format(cost)) + clf = LinearSVC( + C=cost, + class_weight={ + 1: 2, + -1: 1 + }, + intercept_scaling=1.0, + verbose=0, + penalty='l2', + loss='squared_hinge', + tol=0.0001, + dual=True, + max_iter=2000, + ) + cls_labels = targets[:, cls].astype(dtype=np.int32, copy=True) + # meaning of labels in VOC/COCO original loaded target files: + # label 0 = not present, set it to -1 as svm train target + # label 1 = present. Make the svm train target labels as -1, 1. 
+ cls_labels[np.where(cls_labels == 0)] = -1 + #num_positives = len(np.where(cls_labels == 1)[0]) + #num_negatives = len(cls_labels) - num_positives + + #logger.info('cls: {} has +ve: {} -ve: {} ratio: {}'.format( + # cls, num_positives, num_negatives, + # float(num_positives) / num_negatives) + #) + #logger.info('features: {} cls_labels: {}'.format( + # features.shape, cls_labels.shape)) + ap_scores = cross_val_score( + clf, + features, + cls_labels, + cv=3, + scoring='average_precision') + clf.fit(features, cls_labels) + + #logger.info('cls: {} cost: {} AP: {} mean:{}'.format( + # cls, cost, ap_scores, ap_scores.mean())) + #logger.info('Saving cls cost AP to: {}'.format(ap_out_file)) + np.save(ap_out_file, np.array([ap_scores.mean()])) + #logger.info('Saving SVM model to: {}'.format(out_file)) + with open(out_file, 'wb') as fwrite: + pickle.dump(clf, fwrite) + print("time: {:.4g} s".format(time.time() - start)) + + +def main(): + parser = argparse.ArgumentParser(description='SVM model training') + parser.add_argument( + '--data_file', + type=str, + default=None, + help="Numpy file containing image features") + parser.add_argument( + '--targets_data_file', + type=str, + default=None, + help="Numpy file containing image labels") + parser.add_argument( + '--output_path', + type=str, + default=None, + help="path where to save the trained SVM models") + parser.add_argument( + '--costs_list', + type=str, + default="0.01,0.1", + help="comma separated string containing list of costs") + parser.add_argument( + '--random_seed', + type=int, + default=100, + help="random seed for SVM classifier training") + + parser.add_argument( + '--cls_list', + type=str, + default=None, + help="comma separated string list of classes to train") + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + opts = parser.parse_args() + #logger.info(opts) + train_svm(opts) + + +if __name__ == '__main__': + main() diff --git a/benchmarks/svm_tools/train_svm_kfold_parallel.py b/benchmarks/svm_tools/train_svm_kfold_parallel.py new file mode 100644 index 00000000..1ffbcb8b --- /dev/null +++ b/benchmarks/svm_tools/train_svm_kfold_parallel.py @@ -0,0 +1,151 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +################################################################################ +""" +SVM training using 3-fold cross-validation. + +Relevant transfer tasks: Image Classification VOC07 and COCO2014. 
+""" + +from __future__ import division +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import multiprocessing as mp +import tqdm +import argparse +import logging +import numpy as np +import os +import pickle +import sys +from sklearn.svm import LinearSVC +from sklearn.model_selection import cross_val_score + +import svm_helper + +import pdb + + +def task(cls, cost, opts, features, targets): + out_file, ap_out_file = svm_helper.get_svm_train_output_files( + cls, cost, opts.output_path) + if not (os.path.exists(out_file) and os.path.exists(ap_out_file)): + clf = LinearSVC( + C=cost, + class_weight={ + 1: 2, + -1: 1 + }, + intercept_scaling=1.0, + verbose=0, + penalty='l2', + loss='squared_hinge', + tol=0.0001, + dual=True, + max_iter=2000, + ) + cls_labels = targets[:, cls].astype(dtype=np.int32, copy=True) + cls_labels[np.where(cls_labels == 0)] = -1 + ap_scores = cross_val_score( + clf, features, cls_labels, cv=3, scoring='average_precision') + clf.fit(features, cls_labels) + np.save(ap_out_file, np.array([ap_scores.mean()])) + with open(out_file, 'wb') as fwrite: + pickle.dump(clf, fwrite) + return 0 + + +def mp_helper(args): + return task(*args) + + +def train_svm(opts): + assert os.path.exists(opts.data_file), "Data file not found. Abort!" + if not os.path.exists(opts.output_path): + os.makedirs(opts.output_path) + + features, targets = svm_helper.load_input_data(opts.data_file, + opts.targets_data_file) + # normalize the features: N x 9216 (example shape) + features = svm_helper.normalize_features(features) + + # parse the cost values for training the SVM on + costs_list = svm_helper.parse_cost_list(opts.costs_list) + + # classes for which SVM training should be done + if opts.cls_list: + cls_list = [int(cls) for cls in opts.cls_list.split(",")] + else: + num_classes = targets.shape[1] + cls_list = range(num_classes) + + num_task = len(cls_list) * len(costs_list) + args_cls = [] + args_cost = [] + for cls in cls_list: + for cost in costs_list: + args_cls.append(cls) + args_cost.append(cost) + args_opts = [opts] * num_task + args_features = [features] * num_task + args_targets = [targets] * num_task + + pool = mp.Pool(mp.cpu_count()) + for _ in tqdm.tqdm( + pool.imap_unordered( + mp_helper, + zip(args_cls, args_cost, args_opts, args_features, + args_targets)), + total=num_task): + pass + + +def main(): + parser = argparse.ArgumentParser(description='SVM model training') + parser.add_argument( + '--data_file', + type=str, + default=None, + help="Numpy file containing image features") + parser.add_argument( + '--targets_data_file', + type=str, + default=None, + help="Numpy file containing image labels") + parser.add_argument( + '--output_path', + type=str, + default=None, + help="path where to save the trained SVM models") + parser.add_argument( + '--costs_list', + type=str, + default="0.01,0.1", + help="comma separated string containing list of costs") + parser.add_argument( + '--random_seed', + type=int, + default=100, + help="random seed for SVM classifier training") + + parser.add_argument( + '--cls_list', + type=str, + default=None, + help="comma separated string list of classes to train") + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + opts = parser.parse_args() + train_svm(opts) + + +if __name__ == '__main__': + main() diff --git a/benchmarks/svm_tools/train_svm_low_shot.py b/benchmarks/svm_tools/train_svm_low_shot.py new file mode 100644 index 00000000..b5a0fbb2 --- /dev/null +++ 
b/benchmarks/svm_tools/train_svm_low_shot.py @@ -0,0 +1,144 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +################################################################################ +""" +Low Shot SVM training. + +Relevant transfer tasks: Low-shot Image Classification VOC07 and Places205 low +shot samples. +""" + +from __future__ import division +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import argparse +import logging +import numpy as np +import os +import pickle +import sys +from sklearn.svm import LinearSVC +from tqdm import tqdm + +import svm_helper + +import time + +# create the logger +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def train_svm_low_shot(opts): + assert os.path.exists(opts.data_file), "Data file not found. Abort!" + if not os.path.exists(opts.output_path): + os.makedirs(opts.output_path) + + features, targets = svm_helper.load_input_data(opts.data_file, + opts.targets_data_file) + # normalize the features: N x 9216 (example shape) + features = svm_helper.normalize_features(features) + + # parse the cost values for training the SVM on + costs_list = svm_helper.parse_cost_list(opts.costs_list) + #logger.info('Training SVM for costs: {}'.format(costs_list)) + + # classes for which SVM testing should be done + num_classes, cls_list = svm_helper.get_low_shot_svm_classes( + targets, opts.dataset) + + for cls in tqdm(cls_list): + for cost_idx in range(len(costs_list)): + start = time.time() + cost = costs_list[cost_idx] + suffix = '_'.join( + opts.targets_data_file.split('/')[-1].split('.')[0].split('_') + [-2:]) + out_file = svm_helper.get_low_shot_output_file( + opts, cls, cost, suffix) + if os.path.exists(out_file): + logger.info('SVM model exists: {}'.format(out_file)) + else: + #logger.info('SVM model not found: {}'.format(out_file)) + #logger.info('Training model with the cost: {}'.format(cost)) + clf = LinearSVC( + C=cost, + class_weight={ + 1: 2, + -1: 1 + }, + intercept_scaling=1.0, + verbose=0, + penalty='l2', + loss='squared_hinge', + tol=0.0001, + dual=True, + max_iter=2000, + ) + train_feats, train_cls_labels = svm_helper.get_cls_feats_labels( + cls, features, targets, opts.dataset) + #num_positives = len(np.where(train_cls_labels == 1)[0]) + #num_negatives = len(np.where(train_cls_labels == -1)[0]) + + #logger.info('cls: {} has +ve: {} -ve: {} ratio: {}'.format( + # cls, num_positives, num_negatives, + # float(num_positives) / num_negatives) + #) + #logger.info('features: {} cls_labels: {}'.format( + # train_feats.shape, train_cls_labels.shape)) + clf.fit(train_feats, train_cls_labels) + #logger.info('Saving SVM model to: {}'.format(out_file)) + with open(out_file, 'wb') as fwrite: + pickle.dump(clf, fwrite) + #print("time: {:.4g} s".format(time.time() - start)) + #logger.info('All done!') + + +def main(): + parser = argparse.ArgumentParser(description='Low-shot SVM model training') + parser.add_argument( + '--data_file', + type=str, + default=None, + help="Numpy file containing image features") + parser.add_argument( + '--targets_data_file', + type=str, + default=None, + help="Numpy file containing image labels") + parser.add_argument( + '--costs_list', + type=str, + 
default="0.01,0.1", + help="comma separated string containing list of costs") + parser.add_argument( + '--output_path', + type=str, + default=None, + help="path where to save the trained SVM models") + parser.add_argument( + '--random_seed', + type=int, + default=100, + help="random seed for SVM classifier training") + parser.add_argument( + '--dataset', type=str, default="voc", help='voc | places') + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + opts = parser.parse_args() + + #logger.info(opts) + train_svm_low_shot(opts) + + +if __name__ == '__main__': + main() diff --git a/benchmarks/svm_tools/train_svm_low_shot_parallel.py b/benchmarks/svm_tools/train_svm_low_shot_parallel.py new file mode 100644 index 00000000..f3a0843d --- /dev/null +++ b/benchmarks/svm_tools/train_svm_low_shot_parallel.py @@ -0,0 +1,145 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +################################################################################ +""" +Low Shot SVM training. + +Relevant transfer tasks: Low-shot Image Classification VOC07 and Places205 low +shot samples. +""" + +from __future__ import division +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import multiprocessing as mp +import tqdm +import argparse +import logging +import numpy as np +import os +import pickle +import sys +from sklearn.svm import LinearSVC + +import svm_helper + +import pdb + + +def task(cls, cost, opts, features, targets): + suffix = '_'.join( + opts.targets_data_file.split('/')[-1].split('.')[0].split('_')[-2:]) + out_file = svm_helper.get_low_shot_output_file(opts, cls, cost, suffix) + if not os.path.exists(out_file): + clf = LinearSVC( + C=cost, + class_weight={ + 1: 2, + -1: 1 + }, + intercept_scaling=1.0, + verbose=0, + penalty='l2', + loss='squared_hinge', + tol=0.0001, + dual=True, + max_iter=2000, + ) + train_feats, train_cls_labels = svm_helper.get_cls_feats_labels( + cls, features, targets, opts.dataset) + clf.fit(train_feats, train_cls_labels) + #cls_labels = targets[:, cls].astype(dtype=np.int32, copy=True) + #cls_labels[np.where(cls_labels == 0)] = -1 + #clf.fit(features, cls_labels) + with open(out_file, 'wb') as fwrite: + pickle.dump(clf, fwrite) + return 0 + + +def mp_helper(args): + return task(*args) + + +def train_svm_low_shot(opts): + assert os.path.exists(opts.data_file), "Data file not found. Abort!" 
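+    # Each (class, cost) pair below is fitted in its own worker process; the
+    # trained SVMs are pickled under opts.output_path (created next if missing).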
+ if not os.path.exists(opts.output_path): + os.makedirs(opts.output_path) + + features, targets = svm_helper.load_input_data(opts.data_file, + opts.targets_data_file) + # normalize the features: N x 9216 (example shape) + features = svm_helper.normalize_features(features) + + # parse the cost values for training the SVM on + costs_list = svm_helper.parse_cost_list(opts.costs_list) + + # classes for which SVM testing should be done + num_classes, cls_list = svm_helper.get_low_shot_svm_classes( + targets, opts.dataset) + + num_task = len(cls_list) * len(costs_list) + args_cls = [] + args_cost = [] + for cls in cls_list: + for cost in costs_list: + args_cls.append(cls) + args_cost.append(cost) + args_opts = [opts] * num_task + args_features = [features] * num_task + args_targets = [targets] * num_task + + pool = mp.Pool(mp.cpu_count()) + for _ in tqdm.tqdm( + pool.imap_unordered( + mp_helper, + zip(args_cls, args_cost, args_opts, args_features, + args_targets)), + total=num_task): + pass + + +def main(): + parser = argparse.ArgumentParser(description='Low-shot SVM model training') + parser.add_argument( + '--data_file', + type=str, + default=None, + help="Numpy file containing image features") + parser.add_argument( + '--targets_data_file', + type=str, + default=None, + help="Numpy file containing image labels") + parser.add_argument( + '--costs_list', + type=str, + default="0.01,0.1", + help="comma separated string containing list of costs") + parser.add_argument( + '--output_path', + type=str, + default=None, + help="path where to save the trained SVM models") + parser.add_argument( + '--random_seed', + type=int, + default=100, + help="random seed for SVM classifier training") + parser.add_argument( + '--dataset', type=str, default="voc", help='voc | places') + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + opts = parser.parse_args() + train_svm_low_shot(opts) + + +if __name__ == '__main__': + main() diff --git a/configs/base.py b/configs/base.py new file mode 100644 index 00000000..f0695d52 --- /dev/null +++ b/configs/base.py @@ -0,0 +1,18 @@ +train_cfg = {} +test_cfg = {} +optimizer_config = dict() # grad_clip, coalesce, bucket_size_mb +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +dist_params = dict(backend='nccl') +cudnn_benchmark = True +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/classification/cifar10/r50.py b/configs/classification/cifar10/r50.py new file mode 100644 index 00000000..ff2fb8a9 --- /dev/null +++ b/configs/classification/cifar10/r50.py @@ -0,0 +1,59 @@ +_base_ = '../../base.py' +# model settings +model = dict( + type='Classification', + pretrained=None, + backbone=dict( + type='ResNet', + depth=50, + out_indices=[4], # 4: stage-4 + norm_cfg=dict(type='BN')), + head=dict( + type='ClsHead', with_avg_pool=True, in_channels=2048, num_classes=10)) +# dataset settings +data_source_cfg = dict(type='Cifar10', root='data/cifar/') +dataset_type = 'ClassificationDataset' +img_norm_cfg = dict(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.201]) +train_pipeline = [ + dict(type='RandomCrop', size=32, padding=4), + dict(type='RandomHorizontalFlip'), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +test_pipeline = [ + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=128, + workers_per_gpu=2, + train=dict( + 
type=dataset_type, + data_source=dict(split='train', **data_source_cfg), + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_source=dict(split='test', **data_source_cfg), + pipeline=test_pipeline), + test=dict( + type=dataset_type, + data_source=dict(split='test', **data_source_cfg), + pipeline=test_pipeline)) +# additional hooks +custom_hooks = [ + dict( + type='ValidateHook', + dataset=data['val'], + initial=True, + interval=10, + imgs_per_gpu=128, + workers_per_gpu=8, + eval_param=dict(topk=(1, 5))) +] +# optimizer +optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005) +# learning policy +lr_config = dict(policy='step', step=[150, 250]) +checkpoint_config = dict(interval=50) +# runtime settings +total_epochs = 350 diff --git a/configs/classification/imagnet/r50.py b/configs/classification/imagnet/r50.py new file mode 100644 index 00000000..6425dbf7 --- /dev/null +++ b/configs/classification/imagnet/r50.py @@ -0,0 +1,68 @@ +_base_ = '../../base.py' +# model settings +model = dict( + type='Classification', + pretrained=None, + backbone=dict( + type='ResNet', + depth=50, + out_indices=[4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='SyncBN')), + head=dict( + type='ClsHead', with_avg_pool=True, in_channels=2048, + num_classes=1000)) +# dataset settings +data_source_cfg = dict( + type='ImageNet', + memcached=False, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/imagenet/meta/train_labeled.txt' +data_train_root = 'data/imagenet/train' +data_test_list = 'data/imagenet/meta/val_labeled.txt' +data_test_root = 'data/imagenet/val' +dataset_type = 'ClassificationDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224), + dict(type='RandomHorizontalFlip'), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +test_pipeline = [ + dict(type='Resize', size=256), + dict(type='CenterCrop', size=224), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=32, # total 256 + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_source=dict( + list_file=data_test_list, root=data_test_root, **data_source_cfg), + pipeline=test_pipeline)) +# additional hooks +custom_hooks = [ + dict( + type='ValidateHook', + dataset=data['val'], + initial=True, + interval=10, + imgs_per_gpu=32, + workers_per_gpu=2, + eval_param=dict(topk=(1, 5))) +] +# optimizer +optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001) +# learning policy +lr_config = dict(policy='step', step=[30, 60, 90]) +checkpoint_config = dict(interval=10) +# runtime settings +total_epochs = 90 diff --git a/configs/linear_classification/imagenet/r50_multihead.py b/configs/linear_classification/imagenet/r50_multihead.py new file mode 100644 index 00000000..05b8f168 --- /dev/null +++ b/configs/linear_classification/imagenet/r50_multihead.py @@ -0,0 +1,89 @@ +_base_ = '../../base.py' +# model settings +model = dict( + type='Classification', + pretrained=None, + frozen_backbone=True, + with_sobel=False, + backbone=dict( + type='ResNet', + depth=50, + in_channels=3, + out_indices=[0, 1, 2, 3, 4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='BN')), + head=dict( + type='MultiClsHead', + pool_type='specified', + in_indices=[0, 1, 2, 3, 4], + with_last_layer_unpool=True, + 
backbone='resnet50', + norm_cfg=dict(type='BN', momentum=0.1, affine=False), + num_classes=1000)) +# dataset settings +data_source_cfg = dict( + type='ImageNet', + memcached=False, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/imagenet/meta/train_labeled.txt' +data_train_root = 'data/imagenet/train' +data_test_list = 'data/imagenet/meta/val_labeled.txt' +data_test_root = 'data/imagenet/val' +dataset_type = 'ClassificationDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224), + dict(type='RandomHorizontalFlip'), + dict( + type='ColorJitter', + brightness=0.4, + contrast=0.4, + saturation=0.4, + hue=0.), + dict(type='ToTensor'), + dict(type='Lighting'), + dict(type='Normalize', **img_norm_cfg), +] +test_pipeline = [ + dict(type='Resize', size=256), + dict(type='CenterCrop', size=224), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=256, # total 256 + workers_per_gpu=8, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_source=dict( + list_file=data_test_list, root=data_test_root, **data_source_cfg), + pipeline=test_pipeline)) +# additional hooks +custom_hooks = [ + dict( + type='ValidateHook', + dataset=data['val'], + initial=True, + interval=10, + imgs_per_gpu=128, + workers_per_gpu=4, + eval_param=dict(topk=(1, ))) +] +# optimizer +optimizer = dict( + type='SGD', + lr=0.01, + momentum=0.9, + weight_decay=0.0001, + paramwise_options=dict(norm_decay_mult=0.), + nesterov=True) +# learning policy +lr_config = dict(policy='step', step=[30, 60, 90]) +checkpoint_config = dict(interval=10) +# runtime settings +total_epochs = 90 diff --git a/configs/linear_classification/places205/r50_multihead.py b/configs/linear_classification/places205/r50_multihead.py new file mode 100644 index 00000000..135858d0 --- /dev/null +++ b/configs/linear_classification/places205/r50_multihead.py @@ -0,0 +1,89 @@ +_base_ = '../../base.py' +# model settings +model = dict( + type='Classification', + pretrained=None, + frozen_backbone=True, + with_sobel=False, + backbone=dict( + type='ResNet', + depth=50, + in_channels=3, + out_indices=[0, 1, 2, 3, 4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='BN')), + head=dict( + type='MultiClsHead', + pool_type='specified', + in_indices=[0, 1, 2, 3, 4], + with_last_layer_unpool=True, + backbone='resnet50', + norm_cfg=dict(type='BN', momentum=0.1, affine=False), + num_classes=205)) +# dataset settings +data_source_cfg = dict( + type='Places205', + memcached=False, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/places205/meta/train_labeled.txt' +data_train_root = 'data/places205/train' +data_test_list = 'data/places205/meta/val_labeled.txt' +data_test_root = 'data/places205/val' +dataset_type = 'ClassificationDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224), + dict(type='RandomHorizontalFlip'), + dict( + type='ColorJitter', + brightness=0.4, + contrast=0.4, + saturation=0.4, + hue=0.), + dict(type='ToTensor'), + dict(type='Lighting'), + dict(type='Normalize', **img_norm_cfg), +] +test_pipeline = [ + dict(type='Resize', size=256), + dict(type='CenterCrop', size=224), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data 
= dict( + imgs_per_gpu=256, # total 256 + workers_per_gpu=8, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_source=dict( + list_file=data_test_list, root=data_test_root, **data_source_cfg), + pipeline=test_pipeline)) +# additional hooks +custom_hooks = [ + dict( + type='ValidateHook', + dataset=data['val'], + initial=True, + interval=10, + imgs_per_gpu=128, + workers_per_gpu=4, + eval_param=dict(topk=(1, ))) +] +# optimizer +optimizer = dict( + type='SGD', + lr=0.01, + momentum=0.9, + weight_decay=0.0001, + paramwise_options=dict(norm_decay_mult=0.), + nesterov=True) +# learning policy +lr_config = dict(policy='step', step=[30, 60, 90]) +checkpoint_config = dict(interval=10) +# runtime settings +total_epochs = 90 diff --git a/configs/selfsup/deepcluster/r50.py b/configs/selfsup/deepcluster/r50.py new file mode 100644 index 00000000..63d5f301 --- /dev/null +++ b/configs/selfsup/deepcluster/r50.py @@ -0,0 +1,88 @@ +_base_ = '../../base.py' +# model settings +num_classes = 10000 +model = dict( + type='DeepCluster', + pretrained=None, + with_sobel=True, + backbone=dict( + type='ResNet', + depth=50, + in_channels=2, + out_indices=[4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='BN')), + neck=dict(type='AvgPoolNeck'), + head=dict( + type='ClsHead', + with_avg_pool=False, # already has avgpool in the neck + in_channels=2048, + num_classes=num_classes)) +# dataset settings +data_source_cfg = dict( + type='ImageNet', + memcached=True, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/imagenet/meta/train.txt' +data_train_root = 'data/imagenet/train' +dataset_type = 'DeepClusterDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224), + dict(type='RandomHorizontalFlip'), + dict(type='RandomRotation', degrees=2), + dict( + type='ColorJitter', + brightness=0.4, + contrast=0.4, + saturation=1.0, + hue=0.5), + dict(type='RandomGrayscale', p=0.2), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +extract_pipeline = [ + dict(type='Resize', size=256), + dict(type='CenterCrop', size=224), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=64, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline)) +# additional hooks +custom_hooks = [ + dict( + type='DeepClusterHook', + extractor=dict( + imgs_per_gpu=128, + workers_per_gpu=8, + dataset=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, + root=data_train_root, + **data_source_cfg), + pipeline=extract_pipeline)), + clustering=dict(type='Kmeans', k=num_classes, pca_dim=256), + unif_sampling=True, + reweight=False, + reweight_pow=0.5, + initial=True, # call initially + interval=1) +] +# optimizer +optimizer = dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001, + nesterov=False, + paramwise_options={'\Ahead.': dict(momentum=0.)}) +# learning policy +lr_config = dict(policy='step', step=[400]) +checkpoint_config = dict(interval=10) +# runtime settings +total_epochs = 480 diff --git a/configs/selfsup/moco/r50_v1.py b/configs/selfsup/moco/r50_v1.py new file mode 100644 index 00000000..84de7c88 --- /dev/null +++ b/configs/selfsup/moco/r50_v1.py @@ -0,0 +1,59 @@ +_base_ = 
'../../base.py' +# model settings +model = dict( + type='MOCO', + pretrained=None, + queue_len=65536, + feat_dim=128, + momentum=0.999, + backbone=dict( + type='ResNet', + depth=50, + in_channels=3, + out_indices=[4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='BN')), + neck=dict( + type='LinearNeck', + in_channels=2048, + out_channels=128, + with_avg_pool=True), + head=dict(type='ContrastiveHead', temperature=0.07)) +# dataset settings +data_source_cfg = dict( + type='ImageNet', + memcached=False, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/imagenet/meta/train.txt' +data_train_root = 'data/imagenet/train' +dataset_type = 'ContrastiveDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224, scale=(0.2, 1.)), + dict(type='RandomGrayscale', p=0.2), + dict( + type='ColorJitter', + brightness=0.4, + contrast=0.4, + saturation=0.4, + hue=0.4), + dict(type='RandomHorizontalFlip'), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=32, # total 32*8=256 + workers_per_gpu=4, + drop_last=True, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.03, weight_decay=0.0001, momentum=0.9) +# learning policy +lr_config = dict(policy='step', step=[120, 160]) +checkpoint_config = dict(interval=10) +# runtime settings +total_epochs = 200 diff --git a/configs/selfsup/moco/r50_v2.py b/configs/selfsup/moco/r50_v2.py new file mode 100644 index 00000000..8c2bd63f --- /dev/null +++ b/configs/selfsup/moco/r50_v2.py @@ -0,0 +1,75 @@ +_base_ = '../../base.py' +# model settings +model = dict( + type='MOCO', + pretrained=None, + queue_len=65536, + feat_dim=128, + momentum=0.999, + backbone=dict( + type='ResNet', + depth=50, + in_channels=3, + out_indices=[4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='BN')), + neck=dict( + type='NonLinearNeckV1', + in_channels=2048, + hid_channels=2048, + out_channels=128, + with_avg_pool=True), + head=dict(type='ContrastiveHead', temperature=0.2)) +# dataset settings +data_source_cfg = dict( + type='ImageNet', + memcached=False, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/imagenet/meta/train.txt' +data_train_root = 'data/imagenet/train' +dataset_type = 'ContrastiveDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224, scale=(0.2, 1.)), + dict( + type='RandomAppliedTrans', + transforms=[ + dict( + type='ColorJitter', + brightness=0.4, + contrast=0.4, + saturation=0.4, + hue=0.4) + ], + p=0.8), + dict(type='RandomGrayscale', p=0.2), + dict( + type='RandomAppliedTrans', + transforms=[ + dict( + type='GaussianBlur', + sigma_min=0.1, + sigma_max=2.0, + kernel_size=23) + ], + p=0.5), + dict(type='RandomHorizontalFlip'), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=32, # total 32*8=256 + workers_per_gpu=4, + drop_last=True, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.03, weight_decay=0.0001, momentum=0.9) +# learning policy +lr_config = dict(policy='CosineAnealing', min_lr=0.) 
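+# illustrative note: if you train with a different GPU count, scale lr linearly
+# with the total batch size (see docs/GETTING_STARTED.md), e.g. on 4 GPUs:
+#   total batch = 32 imgs_per_gpu * 4 GPUs = 128, so lr = 0.03 * 128 / 256 = 0.015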
+checkpoint_config = dict(interval=10) +# runtime settings +total_epochs = 200 diff --git a/configs/selfsup/npid/r50.py b/configs/selfsup/npid/r50.py new file mode 100644 index 00000000..f1b17088 --- /dev/null +++ b/configs/selfsup/npid/r50.py @@ -0,0 +1,64 @@ +_base_ = '../../base.py' +# model settings +model = dict( + type='NPID', + pretrained=None, + backbone=dict( + type='ResNet', + depth=50, + in_channels=3, + out_indices=[4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='SyncBN')), + neck=dict( + type='LinearNeck', + in_channels=2048, + out_channels=128, + with_avg_pool=True), + head=dict(type='ContrastiveHead', temperature=0.07), + memory_bank=dict( + type='SimpleMemory', length=1281167, feat_dim=128, momentum=0.5)) +# dataset settings +data_source_cfg = dict( + type='ImageNet', + memcached=False, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/imagenet/meta/train.txt' +data_train_root = 'data/imagenet/train' +dataset_type = 'NPIDDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224, scale=(0.2, 1.)), + dict(type='RandomGrayscale', p=0.2), + dict( + type='ColorJitter', + brightness=0.4, + contrast=0.4, + saturation=0.4, + hue=0.4), + dict(type='RandomHorizontalFlip'), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +test_pipeline = [ + dict(type='Resize', size=256), + dict(type='CenterCrop', size=224), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=32, # total 32*8 + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline)) +# optimizer +optimizer = dict( + type='SGD', lr=0.03, weight_decay=0.0001, momentum=0.9, nesterov=False) +# learning policy +lr_config = dict(policy='step', step=[120, 160]) +checkpoint_config = dict(interval=10) +# runtime settings +total_epochs = 200 diff --git a/configs/selfsup/rotation_pred/r50.py b/configs/selfsup/rotation_pred/r50.py new file mode 100644 index 00000000..cb34e48c --- /dev/null +++ b/configs/selfsup/rotation_pred/r50.py @@ -0,0 +1,64 @@ +_base_ = '../../base.py' +# model settings +model = dict( + type='RotationPred', + pretrained=None, + backbone=dict( + type='ResNet', + depth=50, + in_channels=3, + out_indices=[4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='SyncBN')), + head=dict( + type='ClsHead', with_avg_pool=True, in_channels=2048, num_classes=4)) +# dataset settings +data_source_cfg = dict( + type='ImageNet', + memcached=False, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/imagenet/meta/train.txt' +data_train_root = 'data/imagenet/train' +data_test_list = 'data/imagenet/meta/val.txt' +data_test_root = 'data/imagenet/val' +dataset_type = 'RotationPredDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224), + dict(type='RandomHorizontalFlip'), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +test_pipeline = [ + dict(type='Resize', size=256), + dict(type='CenterCrop', size=224), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=16, # (16*4) x 8 = 512 + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline), + val=dict( 
+ type=dataset_type, + data_source=dict( + list_file=data_test_list, root=data_test_root, **data_source_cfg), + pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0001, nesterov=False) +# learning policy +lr_config = dict( + policy='step', + step=[30, 50], + warmup='linear', + warmup_iters=5, # 5 ep + warmup_ratio=0.1, + warmup_by_epoch=True) +checkpoint_config = dict(interval=10) +# runtime settings +total_epochs = 70 diff --git a/configs/selfsup/simclr/r50_bs256.py b/configs/selfsup/simclr/r50_bs256.py new file mode 100644 index 00000000..cb087730 --- /dev/null +++ b/configs/selfsup/simclr/r50_bs256.py @@ -0,0 +1,77 @@ +_base_ = '../../base.py' +# model settings +model = dict( + type='SimCLR', + pretrained=None, + backbone=dict( + type='ResNet', + depth=50, + in_channels=3, + out_indices=[4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='SyncBN')), + neck=dict( + type='NonLinearNeckV1', + in_channels=2048, + hid_channels=2048, + out_channels=128, + with_avg_pool=True), + head=dict(type='ContrastiveHead', temperature=0.1)) +# dataset settings +data_source_cfg = dict( + type='ImageNet', + memcached=True, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/imagenet/meta/train.txt' +data_train_root = 'data/imagenet/train' +dataset_type = 'ContrastiveDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224), + dict(type='RandomHorizontalFlip'), + dict( + type='RandomAppliedTrans', + transforms=[ + dict( + type='ColorJitter', + brightness=0.8, + contrast=0.8, + saturation=0.8, + hue=0.2) + ], + p=0.8), + dict(type='RandomGrayscale', p=0.2), + dict( + type='RandomAppliedTrans', + transforms=[ + dict( + type='GaussianBlur', + sigma_min=0.1, + sigma_max=2.0, + kernel_size=23) + ], + p=0.5), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=32, # total 32*8 + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline)) +# optimizer +optimizer = dict(type='LARS', lr=0.3, weight_decay=0.000001, momentum=0.9) +# learning policy +lr_config = dict( + policy='CosineAnealing', + min_lr=0., + warmup='linear', + warmup_iters=10, + warmup_ratio=0.01, + warmup_by_epoch=True) +checkpoint_config = dict(interval=10) +# runtime settings +total_epochs = 200 diff --git a/configs/selfsup/simclr/r50_bs512.py b/configs/selfsup/simclr/r50_bs512.py new file mode 100644 index 00000000..110f6f63 --- /dev/null +++ b/configs/selfsup/simclr/r50_bs512.py @@ -0,0 +1,77 @@ +_base_ = '../../base.py' +# model settings +model = dict( + type='SimCLR', + pretrained=None, + backbone=dict( + type='ResNet', + depth=50, + in_channels=3, + out_indices=[4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='SyncBN')), + neck=dict( + type='NonLinearNeckV1', + in_channels=2048, + hid_channels=2048, + out_channels=128, + with_avg_pool=True), + head=dict(type='ContrastiveHead', temperature=0.1)) +# dataset settings +data_source_cfg = dict( + type='ImageNet', + memcached=False, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/imagenet/meta/train.txt' +data_train_root = 'data/imagenet/train' +dataset_type = 'ContrastiveDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224), + 
dict(type='RandomHorizontalFlip'), + dict( + type='RandomAppliedTrans', + transforms=[ + dict( + type='ColorJitter', + brightness=0.8, + contrast=0.8, + saturation=0.8, + hue=0.2) + ], + p=0.8), + dict(type='RandomGrayscale', p=0.2), + dict( + type='RandomAppliedTrans', + transforms=[ + dict( + type='GaussianBlur', + sigma_min=0.1, + sigma_max=2.0, + kernel_size=23) + ], + p=0.5), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=64, # total 64*8 + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline)) +# optimizer +optimizer = dict(type='LARS', lr=0.6, weight_decay=0.000001, momentum=0.9) +# learning policy +lr_config = dict( + policy='CosineAnealing', + min_lr=0., + warmup='linear', + warmup_iters=10, + warmup_ratio=0.01, + warmup_by_epoch=True) +checkpoint_config = dict(interval=10) +# runtime settings +total_epochs = 200 diff --git a/configs/semisup_classification/imagenet_10percent/r50.py b/configs/semisup_classification/imagenet_10percent/r50.py new file mode 100644 index 00000000..2313c322 --- /dev/null +++ b/configs/semisup_classification/imagenet_10percent/r50.py @@ -0,0 +1,69 @@ +_base_ = '../../base.py' +# model settings +model = dict( + type='Classification', + pretrained=None, + backbone=dict( + type='ResNet', + depth=50, + out_indices=[4], # 0: conv-1, x: stage-x + norm_cfg=dict(type='SyncBN')), + head=dict( + type='ClsHead', with_avg_pool=True, in_channels=2048, + num_classes=1000)) +# dataset settings +data_source_cfg = dict( + type='ImageNet', + memcached=True, + mclient_path='/mnt/lustre/share/memcached_client') +data_train_list = 'data/imagenet/meta/train_labeled_10percent.txt' +data_train_root = 'data/imagenet/train' +data_test_list = 'data/imagenet/meta/val_labeled.txt' +data_test_root = 'data/imagenet/val' +dataset_type = 'ClassificationDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +train_pipeline = [ + dict(type='RandomResizedCrop', size=224), + dict(type='RandomHorizontalFlip'), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +test_pipeline = [ + dict(type='Resize', size=256), + dict(type='CenterCrop', size=224), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), +] +data = dict( + imgs_per_gpu=32, # total 256 + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, root=data_train_root, + **data_source_cfg), + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_source=dict( + list_file=data_test_list, root=data_test_root, **data_source_cfg), + pipeline=test_pipeline)) +# additional hooks +custom_hooks = [ + dict( + type='ValidateHook', + dataset=data['val'], + initial=True, + interval=2, + imgs_per_gpu=32, + workers_per_gpu=2, + eval_param=dict(topk=(1, 5))) +] +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, + paramwise_options={'\Ahead.': dict(lr_mult=10)}) +# learning policy +lr_config = dict(policy='step', step=[18, 24], gamma=0.2) +checkpoint_config = dict(interval=2) +# runtime settings +total_epochs = 30 diff --git a/configs/semisup_classification/imagenet_1percent/r50.py b/configs/semisup_classification/imagenet_1percent/r50.py new file mode 100644 index 00000000..ab5de513 --- /dev/null +++ b/configs/semisup_classification/imagenet_1percent/r50.py @@ -0,0 +1,69 @@ +_base_ = '../../base.py' +# model settings +model = 
dict(
+    type='Classification',
+    pretrained=None,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='SyncBN')),
+    head=dict(
+        type='ClsHead', with_avg_pool=True, in_channels=2048,
+        num_classes=1000))
+# dataset settings
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=True,
+    mclient_path='/mnt/lustre/share/memcached_client')
+data_train_list = 'data/imagenet/meta/train_labeled_1percent.txt'
+data_train_root = 'data/imagenet/train'
+data_test_list = 'data/imagenet/meta/val_labeled.txt'
+data_test_root = 'data/imagenet/val'
+dataset_type = 'ClassificationDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomHorizontalFlip'),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+test_pipeline = [
+    dict(type='Resize', size=256),
+    dict(type='CenterCrop', size=224),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=32,  # total 256
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_test_list, root=data_test_root, **data_source_cfg),
+        pipeline=test_pipeline))
+# additional hooks
+custom_hooks = [
+    dict(
+        type='ValidateHook',
+        dataset=data['val'],
+        initial=True,
+        interval=2,
+        imgs_per_gpu=32,
+        workers_per_gpu=2,
+        eval_param=dict(topk=(1, 5)))
+]
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005,
+                 paramwise_options={'\Ahead.': dict(lr_mult=100)})
+# learning policy
+lr_config = dict(policy='step', step=[12, 16], gamma=0.2)
+checkpoint_config = dict(interval=2)
+# runtime settings
+total_epochs = 20
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
new file mode 100644
index 00000000..76877891
--- /dev/null
+++ b/docs/CHANGELOG.md
@@ -0,0 +1,2 @@
+## Changelog
+
diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md
new file mode 100644
index 00000000..c13a479a
--- /dev/null
+++ b/docs/GETTING_STARTED.md
@@ -0,0 +1,192 @@
+# Getting Started
+
+This page provides basic tutorials about the usage of OpenSelfSup.
+For installation instructions, please see [INSTALL.md](INSTALL.md).
+
+## Train existing methods
+
+**Note**: The default learning rate in config files is for 8 GPUs (except for those under `configs/linear_classification` that use 1 GPU). If you use a different number of GPUs, the total batch size changes in proportion, so you have to scale the learning rate following `new_lr = old_lr * new_ngpus / old_ngpus` (see the worked example below). We recommend using `tools/dist_train.sh` even with 1 GPU, since some methods do not support non-distributed training.
+
+### Train with single/multiple GPUs
+```shell
+# checkpoints and logs are saved in the same sub-directory as the config file under `work_dirs/` by default.
+bash tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
+```
+
+An example:
+```shell
+bash tools/dist_train.sh configs/selfsup/odc/r50_v1.py 8
+```
+
+Optional arguments are:
+- `--work_dir ${WORK_DIR}`: Override the default working directory.
+- `--resume_from ${CHECKPOINT_FILE}`: Resume from a previous checkpoint file.
+- `--pretrained ${PRETRAIN_WEIGHTS}`: Load pretrained weights for the backbone.
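+
+For example, a 4-GPU run of MoCo v2 with a custom work directory (the path here is just a placeholder). Applying the rule above, the config's `lr=0.03` (tuned for 8 GPUs) should be edited to `0.03 * 4 / 8 = 0.015`:
+```shell
+bash tools/dist_train.sh configs/selfsup/moco/r50_v2.py 4 --work_dir work_dirs/moco_v2_4gpu
+```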
+
+Alternatively, if you run OpenSelfSup on a cluster managed with [slurm](https://slurm.schedmd.com/):
+```shell
+SRUN_ARGS="${SRUN_ARGS}" bash tools/srun_train.sh ${PARTITION} ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
+```
+
+An example:
+```shell
+SRUN_ARGS="-w xx.xx.xx.xx" bash tools/srun_train.sh Dummy configs/selfsup/odc/r50_v1.py 8
+```
+
+### Launch multiple jobs on a single machine
+
+If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,
+you need to specify different ports (29500 by default) for each job to avoid communication conflicts.
+
+If you use `dist_train.sh` to launch training jobs:
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_train.sh ${CONFIG_FILE} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_train.sh ${CONFIG_FILE} 4
+```
+
+If you launch training jobs with slurm:
+```shell
+GPUS_PER_NODE=4 bash tools/srun_train.sh ${PARTITION} ${CONFIG_FILE} 4 --port 29500
+GPUS_PER_NODE=4 bash tools/srun_train.sh ${PARTITION} ${CONFIG_FILE} 4 --port 29501
+```
+
+## Benchmarks
+
+We provide several standard benchmarks to evaluate representation learning.
+
+### VOC07 Linear SVM & Low-shot Linear SVM
+
+```shell
+bash benchmarks/dist_test_svm.sh ${CONFIG_FILE} ${EPOCH} ${FEAT_LIST} ${GPU_NUM}
+```
+Arguments:
+- `${FEAT_LIST}` is a string specifying which features from layer1 to layer5 to evaluate; e.g., to evaluate layer5 only, `FEAT_LIST` is `feat5`; to evaluate all features, `FEAT_LIST` is `feat1 feat2 feat3 feat4 feat5` (separated by spaces).
+- `$GPU_NUM` is the number of GPUs used to extract features.
+
+### ImageNet / Places205 Linear Classification
+
+```shell
+bash benchmarks/dist_test_cls.sh ${CONFIG_FILE} ${EPOCH} ${DATASET} [optional arguments]
+```
+Arguments:
+- `${DATASET}` is one of `['imagenet', 'places205']`.
+- Optional arguments include `--resume_from ${CHECKPOINT_FILE}` to resume from a previous checkpoint file.
+
+### VOC07+12 / COCO17 Object Detection
+
+1. First, extract backbone weights:
+
+   ```shell
+   python tools/extract_backbone_weights.py ${CHECKPOINT} --save-path ${WEIGHT_FILE}
+   ```
+   Arguments:
+   - `CHECKPOINT`: the checkpoint file of a selfsup method named as `epoch_*.pth`.
+   - `WEIGHT_FILE`: the output backbone weights file, e.g., `odc_v1.pth`.
+
+2. Next, run detection. For more details on setting up the environment for detection, please refer to [here](benchmarks/detection/README.md).
+```shell
+conda activate detectron2
+cd benchmarks/detection
+python convert-pretrain-to-detectron2.py ${WEIGHT_FILE} ${OUTPUT_FILE} # must use .pkl as the output extension.
+bash run.sh ${DET_CFG} ${OUTPUT_FILE}
+```
+Arguments:
+- `DET_CFG`: the detectron2 config file; usually we use `configs/pascal_voc_R_50_C4_24k_moco.yaml`.
+- `OUTPUT_FILE`: converted backbone weights file, e.g., `odc_v1.pkl`.
+
+**Note**:
+- This benchmark must use 8 GPUs, following the default setting from MoCo.
+- Please report the mean of 5 trials in your official paper, according to MoCo.
+- DeepCluster, which uses a Sobel layer, is not supported by detectron2.
+
+### Publish a model
+
+1. Extract the backbone weights as mentioned before. You don't have to extract them again if you've already done so in the benchmark step.
+
+```shell
+python tools/extract_backbone_weights.py ${CHECKPOINT} --save-path ${WEIGHT_FILE}
+```
+
+2. Compute the hash of the weight file and append the hash id to the filename.
+
+```shell
+python tools/publish_model.py ${WEIGHT_FILE}
+```
+
+## How-to
+
+### Use a new dataset
+
+1. Write a data source file under `openselfsup/datasets/data_sources/`. You may refer to the existing ones.
+
+2. Create new config files for your experiments.
+
+### Design your own methods
+
+#### What you need to do
+
+1. Create a dataset file under `openselfsup/datasets/` (better to reuse existing ones);
+2. Create a model file under `openselfsup/models/`. The model typically contains:
+   i) backbone (required): maps images to deep features from different depths of layers;
+   ii) neck (optional): maps deep features to compact feature vectors;
+   iii) head (optional): defines the loss functions;
+   iv) memory_bank (optional): defines the memory banks.
+3. Create a config file under `configs/` and set up the configs;
+4. Create a hook file under `openselfsup/hooks/` if your method requires additional operations before the run, every several iterations, every several epochs, or after the run.
+
+You may refer to existing modules under the respective folders.
+
+#### Features that may facilitate your implementation
+
+* Decoupled data source and dataset.
+
+Since a dataset is tied to a specific task while a data source is general, we decouple data source and dataset in OpenSelfSup.
+
+```python
+data = dict(
+    train=dict(type='ContrastiveDataset',
+        data_source=dict(type='ImageNet', list_file='xx', root='xx'),
+        pipeline=train_pipeline),
+    val=dict(...),
+)
+```
+
+* Configure data augmentations in the config file.
+
+The augmentations are the same as in `torchvision.transforms`; `torchvision.transforms.RandomApply` corresponds to `RandomAppliedTrans`. `Lighting` and `GaussianBlur` are additionally implemented.
+
+```python
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomAppliedTrans',
+        transforms=[
+            dict(type='GaussianBlur', sigma_min=0.1, sigma_max=2.0, kernel_size=23)],
+        p=0.5),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg)
+]
+```
+
+* Parameter-wise optimization parameters.
+
+You may specify optimization parameters, including lr, momentum and weight_decay, for a certain group of parameters in the config file with `paramwise_options`. `paramwise_options` is a dict whose keys are regular expressions and whose values are options. Options include 6 fields: lr, lr_mult, momentum, momentum_mult, weight_decay, weight_decay_mult.
+
+```python
+paramwise_options = {
+    '(bn|gn)(\d+)?.(weight|bias)': dict(weight_decay_mult=0.1),
+    '\Ahead.': dict(lr_mult=10, momentum=0)}
+optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9,
+                     weight_decay=0.0001,
+                     paramwise_options=paramwise_options)
+```
+
+* Configure custom hooks in the config file.
+
+The hooks will be called in order. For hook design, please refer to [odc_hook.py](openselfsup/hooks/odc_hook.py) as an example.
+
+```python
+custom_hooks = [
+    dict(type='DeepClusterHook', **kwargs1),
+    dict(type='ODCHook', **kwargs2),
+]
+```
diff --git a/docs/INSTALL.md b/docs/INSTALL.md
new file mode 100644
index 00000000..f604b733
--- /dev/null
+++ b/docs/INSTALL.md
@@ -0,0 +1,146 @@
+## Installation
+
+### Requirements
+
+- Linux (Windows is not officially supported)
+- Python 3.5+
+- PyTorch 1.1 or higher
+- CUDA 9.0 or higher
+- NCCL 2
+- GCC 4.9 or higher
+- [mmcv](https://github.com/open-mmlab/mmcv)
+
+We have tested the following versions of OS and software:
+
+- OS: Ubuntu 16.04/18.04 and CentOS 7.2
+- CUDA: 9.0/9.2/10.0/10.1
+- NCCL: 2.1.15/2.2.13/2.3.7/2.4.2
+- GCC(G++): 4.9/5.3/5.4/7.3
+
+### Install openselfsup
+
+a. Create a conda virtual environment and activate it.
+
+```shell
+conda create -n open-mmlab python=3.7 -y
+conda activate open-mmlab
+```
+
+b. Install PyTorch and torchvision following the [official instructions](https://pytorch.org/), e.g.,
+
+```shell
+conda install pytorch torchvision -c pytorch
+```
+
+c. Install other third-party libraries.
+
+```shell
+conda install faiss-gpu cudatoolkit=10.0 -c pytorch # optional for DeepCluster and ODC, assuming CUDA=10.0
+```
+
+d. Clone the openselfsup repository.
+
+```shell
+git clone https://github.com/open-mmlab/openselfsup.git
+cd openselfsup
+```
+
+e. Install.
+
+```shell
+pip install -v -e .  # or "python setup.py develop"
+```
+
+Note:
+
+1. The git commit id will be written to the version number in step e, e.g. 0.6.0+2e7045c. The version will also be saved in trained models.
+
+2. Following the above instructions, openselfsup is installed in `dev` mode; any local modifications made to the code will take effect without the need to reinstall it (unless you submit some commits and want to update the version number).
+
+3. If you would like to use `opencv-python-headless` instead of `opencv-python`,
+you can install it before installing MMCV.
+
+4. Some dependencies are optional. Simply running `pip install -v -e .` will only install the minimum runtime requirements. To use optional dependencies like `albumentations` and `imagecorruptions`, either install them manually with `pip install -r requirements/optional.txt` or specify the desired extras when calling `pip` (e.g. `pip install -v -e .[optional]`). Valid keys for the extras field are: `all`, `tests`, `build`, and `optional`.
+
+
+### Prepare datasets
+
+It is recommended to symlink your dataset root (assuming $YOUR_DATA_ROOT) to `$OPENSELFSUP/data`.
+If your folder structure is different, you may need to change the corresponding paths in config files.
+
+#### Prepare PASCAL VOC
+
+Assuming that you usually store datasets in `$YOUR_DATA_ROOT` (e.g., for me, `/home/xhzhan/data/`), this script will automatically download PASCAL VOC 2007 into `$YOUR_DATA_ROOT`, prepare the required files, create a folder `data` under `$OPENSELFSUP` and make a symlink `VOCdevkit`.
+
+```shell
+cd $OPENSELFSUP
+bash tools/prepare_data/prepare_voc07_cls.sh $YOUR_DATA_ROOT
+```
+
+#### Prepare ImageNet and Places205
+
+Taking ImageNet as an example, you need to 1) download ImageNet; 2) create list files under `$IMAGENET/meta/`: `train.txt` contains an image file name in each line, and `train_labeled.txt` contains `filename[space]label\n` in each line; 3) create a symlink under `$OPENSELFSUP/data/`.
+
+In the end, the folder looks like:
+
+```
+OpenSelfSup
+├── openselfsup
+├── benchmarks
+├── configs
+├── data
+│   ├── VOCdevkit
+│   │   ├── VOC2007
+│   │   ├── VOC2012
+│   ├── imagenet
+│   │   ├── meta
+│   │   │   ├── train.txt ("filename\n" in each line)
+│   │   │   ├── train_labeled.txt ("filename[space]label\n" in each line)
+│   │   │   ├── val.txt
+│   │   │   ├── val_labeled.txt
+│   │   ├── train
+│   │   ├── val
+│   ├── places
+│   │   ├── meta
+│   │   │   ├── train.txt
+│   │   │   ├── train_labeled.txt
+│   │   │   ├── val.txt
+│   │   │   ├── val_labeled.txt
+│   │   ├── train
+│   │   ├── val
+```
+
+### A from-scratch setup script
+
+Here is a full script for setting up openselfsup with conda and linking the dataset paths.
+
+```shell
+conda create -n open-mmlab python=3.7 -y
+conda activate open-mmlab
+
+conda install -c pytorch pytorch torchvision -y
+git clone https://github.com/open-mmlab/OpenSelfSup.git
+cd OpenSelfSup
+pip install -v -e .
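+
+# optional, from step c above: faiss for DeepCluster / ODC
+# (assumes CUDA 10.0; adjust cudatoolkit to your CUDA version)
+conda install faiss-gpu cudatoolkit=10.0 -c pytorch -y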
+ +bash tools/prepare_data/prepare_voc07_cls.sh $YOUR_DATA_ROOT +ln -s $IMAGENET_ROOT data +ln -s $PLACES_ROOT data +``` + +### Using multiple OpenSelfSup versions + +If there are more than one openselfsup on your machine, and you want to use them alternatively, the recommended way is to create multiple conda environments and use different environments for different versions. + +Another way is to insert the following code to the main scripts (`train.py`, `test.py` or any other scripts you run) +```python +import os.path as osp +import sys +sys.path.insert(0, osp.join(osp.dirname(osp.abspath(__file__)), '../')) +``` + +Or run the following command in the terminal of corresponding folder to temporally use the current one. +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` diff --git a/docs/MODEL_ZOO.md b/docs/MODEL_ZOO.md new file mode 100644 index 00000000..1cb01e52 --- /dev/null +++ b/docs/MODEL_ZOO.md @@ -0,0 +1 @@ +#Model Zoo diff --git a/docs/relation.jpg b/docs/relation.jpg new file mode 100644 index 00000000..db66ab83 Binary files /dev/null and b/docs/relation.jpg differ diff --git a/openselfsup/__init__.py b/openselfsup/__init__.py new file mode 100644 index 00000000..1c4f7e8f --- /dev/null +++ b/openselfsup/__init__.py @@ -0,0 +1,3 @@ +from .version import __version__, short_version + +__all__ = ['__version__', 'short_version'] diff --git a/openselfsup/apis/__init__.py b/openselfsup/apis/__init__.py new file mode 100644 index 00000000..1d734787 --- /dev/null +++ b/openselfsup/apis/__init__.py @@ -0,0 +1 @@ +from .train import get_root_logger, set_random_seed, train_model diff --git a/openselfsup/apis/train.py b/openselfsup/apis/train.py new file mode 100644 index 00000000..eee1f0ca --- /dev/null +++ b/openselfsup/apis/train.py @@ -0,0 +1,275 @@ +import random +import re +from collections import OrderedDict + +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import DistSamplerSeedHook, Runner, obj_from_dict + +from openselfsup.datasets import build_dataloader +from openselfsup.hooks import build_hook, DistOptimizerHook +from openselfsup.utils import get_root_logger, optimizers, print_log + + +def set_random_seed(seed, deterministic=False): + """Set random seed. + + Args: + seed (int): Seed to be used. + deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Default: False. 
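+
+    Example:
+        >>> set_random_seed(0, deterministic=True)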
+ """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + if deterministic: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def parse_losses(losses): + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError( + '{} is not a tensor or list of tensors'.format(loss_name)) + + loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key) + + log_vars['loss'] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars + + +def batch_processor(model, data, train_mode): + """Process a data batch. + + This method is required as an argument of Runner, which defines how to + process a data batch and obtain proper outputs. The first 3 arguments of + batch_processor are fixed. + + Args: + model (nn.Module): A PyTorch model. + data (dict): The data batch in a dict. + train_mode (bool): Training mode or not. It may be useless for some + models. + + Returns: + dict: A dict containing losses and log vars. + """ + assert model.training, "Must be in training mode." + losses = model(**data) + loss, log_vars = parse_losses(losses) + + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=len(data['img'].data)) + + return outputs + + +def train_model(model, + dataset, + cfg, + distributed=False, + timestamp=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # start training + if distributed: + _dist_train( + model, dataset, cfg, logger=logger, timestamp=timestamp, meta=meta) + else: + _non_dist_train( + model, dataset, cfg, logger=logger, timestamp=timestamp, meta=meta) + + +def build_optimizer(model, optimizer_cfg): + """Build optimizer from configs. + + Args: + model (:obj:`nn.Module`): The model with parameters to be optimized. + optimizer_cfg (dict): The config dict of the optimizer. + Positional fields are: + - type: class name of the optimizer. + - lr: base learning rate. + Optional fields are: + - any arguments of the corresponding optimizer type, e.g., + weight_decay, momentum, etc. + - paramwise_options: a dict with regular expression as keys + to match parameter names and a dict containing options as + values. Options include 6 fields: lr, lr_mult, momentum, + momentum_mult, weight_decay, weight_decay_mult. + + Returns: + torch.optim.Optimizer: The initialized optimizer. 
+
+    Example:
+        >>> model = torch.nn.modules.Conv1d(1, 1, 1)
+        >>> paramwise_options = {
+        >>>     r'(bn|gn)(\d+)?.(weight|bias)': dict(weight_decay_mult=0.1),
+        >>>     r'\Ahead.': dict(lr_mult=10, momentum=0)}
+        >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9,
+        >>>                      weight_decay=0.0001,
+        >>>                      paramwise_options=paramwise_options)
+        >>> optimizer = build_optimizer(model, optimizer_cfg)
+    """
+    if hasattr(model, 'module'):
+        model = model.module
+
+    optimizer_cfg = optimizer_cfg.copy()
+    paramwise_options = optimizer_cfg.pop('paramwise_options', None)
+    # if no paramwise option is specified, just use the global setting
+    if paramwise_options is None:
+        return obj_from_dict(optimizer_cfg, optimizers,
+                             dict(params=model.parameters()))
+    else:
+        assert isinstance(paramwise_options, dict)
+        params = []
+        for name, param in model.named_parameters():
+            param_group = {'params': [param]}
+            if not param.requires_grad:
+                params.append(param_group)
+                continue
+
+            for regexp, options in paramwise_options.items():
+                if re.search(regexp, name):
+                    for key, value in options.items():
+                        if key.endswith('_mult'):  # is a multiplier
+                            key = key[:-5]
+                            assert key in optimizer_cfg, \
+                                "{} not in optimizer_cfg".format(key)
+                            value = optimizer_cfg[key] * value
+                        param_group[key] = value
+                        if not dist.is_initialized() or dist.get_rank() == 0:
+                            print_log('paramwise_options -- {}: {}={}'.format(
+                                name, key, value))
+
+            # otherwise use the global settings
+            params.append(param_group)
+
+        optimizer_cls = getattr(optimizers, optimizer_cfg.pop('type'))
+        return optimizer_cls(params, **optimizer_cfg)
+
+
+def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
+    # prepare data loaders
+    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+    data_loaders = [
+        build_dataloader(
+            ds,
+            cfg.data.imgs_per_gpu,
+            cfg.data.workers_per_gpu,
+            dist=True,
+            shuffle=True,
+            replace=getattr(cfg.data, 'sampling_replace', False),
+            seed=cfg.seed,
+            drop_last=getattr(cfg.data, 'drop_last', False)) for ds in dataset
+    ]
+    # put model on gpus
+    model = MMDistributedDataParallel(
+        model.cuda(),
+        device_ids=[torch.cuda.current_device()],
+        broadcast_buffers=False)
+
+    # build runner
+    optimizer = build_optimizer(model, cfg.optimizer)
+    runner = Runner(
+        model,
+        batch_processor,
+        optimizer,
+        cfg.work_dir,
+        logger=logger,
+        meta=meta)
+    # an ugly workaround to make the .log and .log.json filenames the same
+    runner.timestamp = timestamp
+
+    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
+
+    # register hooks
+    runner.register_training_hooks(cfg.lr_config, optimizer_config,
+                                   cfg.checkpoint_config, cfg.log_config)
+    runner.register_hook(DistSamplerSeedHook())
+    # register custom hooks
+    for hook in cfg.get('custom_hooks', ()):
+        if hook.type == 'DeepClusterHook':
+            common_params = dict(dist_mode=True, data_loaders=data_loaders)
+        else:
+            common_params = dict(dist_mode=True)
+        runner.register_hook(build_hook(hook, common_params))
+
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
+
+
+def _non_dist_train(model,
+                    dataset,
+                    cfg,
+                    validate=False,
+                    logger=None,
+                    timestamp=None,
+                    meta=None):
+
+    # prepare data loaders
+    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+    data_loaders = [
+        build_dataloader(
+            ds,
+            cfg.data.imgs_per_gpu,
+            cfg.data.workers_per_gpu,
+            cfg.gpus,
+            dist=False,
+            shuffle=True,
+            replace=getattr(cfg.data, 'sampling_replace', False),
+            seed=cfg.seed,
+            drop_last=getattr(cfg.data, 'drop_last', False)) for ds in dataset
+    ]
+    # put model on gpus
+    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
+
+    # build runner
+    optimizer = build_optimizer(model, cfg.optimizer)
+    runner = Runner(
+        model,
+        batch_processor,
+        optimizer,
+        cfg.work_dir,
+        logger=logger,
+        meta=meta)
+    # an ugly workaround to make the .log and .log.json filenames the same
+    runner.timestamp = timestamp
+    optimizer_config = cfg.optimizer_config
+    runner.register_training_hooks(cfg.lr_config, optimizer_config,
+                                   cfg.checkpoint_config, cfg.log_config)
+
+    # register custom hooks
+    for hook in cfg.get('custom_hooks', ()):
+        if hook.type == 'DeepClusterHook':
+            common_params = dict(dist_mode=False, data_loaders=data_loaders)
+        else:
+            common_params = dict(dist_mode=False)
+        runner.register_hook(build_hook(hook, common_params))
+
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
diff --git a/openselfsup/datasets/__init__.py b/openselfsup/datasets/__init__.py
new file mode 100644
index 00000000..11e81bbe
--- /dev/null
+++ b/openselfsup/datasets/__init__.py
@@ -0,0 +1,12 @@
+from .builder import build_dataset
+from .data_sources import *
+from .pipelines import *
+from .classification import ClassificationDataset
+from .deepcluster import DeepClusterDataset
+from .extraction import ExtractDataset
+from .npid import NPIDDataset
+from .rotation_pred import RotationPredDataset
+from .contrastive import ContrastiveDataset
+from .dataset_wrappers import ConcatDataset, RepeatDataset
+from .loader import DistributedGroupSampler, GroupSampler, build_dataloader
+from .registry import DATASETS
diff --git a/openselfsup/datasets/base.py b/openselfsup/datasets/base.py
new file mode 100644
index 00000000..02dc75e9
--- /dev/null
+++ b/openselfsup/datasets/base.py
@@ -0,0 +1,32 @@
+from abc import ABCMeta, abstractmethod
+
+import torch
+from torch.utils.data import Dataset
+
+from openselfsup.utils import print_log, build_from_cfg
+
+from torchvision.transforms import Compose
+
+from .registry import DATASETS, PIPELINES
+from .builder import build_datasource
+
+
+class BaseDataset(Dataset, metaclass=ABCMeta):
+    """Base Dataset.
+    """
+
+    def __init__(self, data_source, pipeline):
+        self.data_source = build_datasource(data_source)
+        pipeline = [build_from_cfg(p, PIPELINES) for p in pipeline]
+        self.pipeline = Compose(pipeline)
+
+    def __len__(self):
+        return self.data_source.get_length()
+
+    @abstractmethod
+    def __getitem__(self, idx):
+        pass
+
+    @abstractmethod
+    def evaluate(self, scores, keyword, logger=None, **kwargs):
+        pass
diff --git a/openselfsup/datasets/builder.py b/openselfsup/datasets/builder.py
new file mode 100644
index 00000000..a7a40325
--- /dev/null
+++ b/openselfsup/datasets/builder.py
@@ -0,0 +1,43 @@
+import copy
+
+from openselfsup.utils import build_from_cfg
+from .dataset_wrappers import ConcatDataset, RepeatDataset
+from .registry import DATASETS, DATASOURCES
+
+
+def _concat_dataset(cfg, default_args=None):
+    ann_files = cfg['ann_file']
+    img_prefixes = cfg.get('img_prefix', None)
+    seg_prefixes = cfg.get('seg_prefix', None)
+    proposal_files = cfg.get('proposal_file', None)
+
+    datasets = []
+    num_dset = len(ann_files)
+    for i in range(num_dset):
+        data_cfg = copy.deepcopy(cfg)
+        data_cfg['ann_file'] = ann_files[i]
+        if isinstance(img_prefixes, (list, tuple)):
+            data_cfg['img_prefix'] = img_prefixes[i]
+        if isinstance(seg_prefixes, (list, tuple)):
+            data_cfg['seg_prefix'] = seg_prefixes[i]
+        if isinstance(proposal_files, (list, tuple)):
+            data_cfg['proposal_file'] = proposal_files[i]
+        datasets.append(build_dataset(data_cfg, default_args))
+
+    return ConcatDataset(datasets)
+
+
+def build_dataset(cfg, default_args=None):
+    if isinstance(cfg, (list, tuple)):
+        dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg])
+    elif cfg['type'] == 'RepeatDataset':
+        dataset = RepeatDataset(
+            build_dataset(cfg['dataset'], default_args), cfg['times'])
+    else:
+        dataset = build_from_cfg(cfg, DATASETS, default_args)
+
+    return dataset
+
+
+def build_datasource(cfg):
+    return build_from_cfg(cfg, DATASOURCES)
diff --git a/openselfsup/datasets/classification.py b/openselfsup/datasets/classification.py
new file mode 100644
index 00000000..584cd952
--- /dev/null
+++ b/openselfsup/datasets/classification.py
@@ -0,0 +1,43 @@
+import torch
+
+from openselfsup.utils import print_log
+
+from .registry import DATASETS
+from .base import BaseDataset
+
+
+@DATASETS.register_module
+class ClassificationDataset(BaseDataset):
+    """Dataset for classification.
+    """
+
+    def __init__(self, data_source, pipeline):
+        super(ClassificationDataset, self).__init__(data_source, pipeline)
+
+    def __getitem__(self, idx):
+        img, target = self.data_source.get_sample(idx)
+        img = self.pipeline(img)
+        return dict(img=img, gt_label=target)
+
+    def evaluate(self, scores, keyword, logger=None, topk=(1, 5)):
+        '''scores: Tensor (NxC) of classification scores.
+        '''
+        eval_res = {}
+
+        target = torch.LongTensor(self.data_source.labels)
+        assert scores.size(0) == target.size(0), \
+            "Inconsistent length for results and labels, {} vs {}".format(
+                scores.size(0), target.size(0))
+        num = scores.size(0)
+        _, pred = scores.topk(max(topk), dim=1, largest=True, sorted=True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))  # KxN
+        for k in topk:
+            correct_k = correct[:k].view(-1).float().sum(0).item()
+            acc = correct_k * 100.0 / num
+            eval_res["{}_acc@{}".format(keyword, k)] = acc
+            if logger is not None and logger != 'silent':
+                print_log(
+                    "{}_acc@{}: {:.03f}".format(keyword, k, acc),
+                    logger=logger)
+        return eval_res
diff --git a/openselfsup/datasets/contrastive.py b/openselfsup/datasets/contrastive.py
new file mode 100644
index 00000000..bf42fef0
--- /dev/null
+++ b/openselfsup/datasets/contrastive.py
@@ -0,0 +1,23 @@
+import torch
+
+from .registry import DATASETS
+from .base import BaseDataset
+
+
+@DATASETS.register_module
+class ContrastiveDataset(BaseDataset):
+    """Dataset for contrastive learning: each sample yields two
+    randomly augmented views of the same image (e.g., for SimCLR).
+    """
+
+    def __init__(self, data_source, pipeline):
+        super(ContrastiveDataset, self).__init__(data_source, pipeline)
+
+    def __getitem__(self, idx):
+        img, _ = self.data_source.get_sample(idx)
+        img1 = self.pipeline(img)
+        img2 = self.pipeline(img)
+        img_cat = torch.cat((img1.unsqueeze(0), img2.unsqueeze(0)), dim=0)
+        return dict(img=img_cat)
+
+    def evaluate(self, scores, keyword, logger=None):
+        raise NotImplementedError
diff --git a/openselfsup/datasets/data_sources/__init__.py b/openselfsup/datasets/data_sources/__init__.py
new file mode 100644
index 00000000..25a66682
--- /dev/null
+++ b/openselfsup/datasets/data_sources/__init__.py
@@ -0,0 +1,3 @@
+from .cifar import Cifar10, Cifar100
+from .image_list import ImageList
+from .imagenet import ImageNet
diff --git a/openselfsup/datasets/data_sources/cifar.py b/openselfsup/datasets/data_sources/cifar.py
new file mode 100644
index 00000000..d04c49e5
--- /dev/null
+++ b/openselfsup/datasets/data_sources/cifar.py
@@ -0,0 +1,55 @@
+from PIL import Image
+
+from torchvision.datasets import CIFAR10, CIFAR100
+
+from ..registry import DATASOURCES
+
+
+@DATASOURCES.register_module
+class Cifar10(object):
+
+    CLASSES = [
+        'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog',
+        'horse', 'ship', 'truck'
+    ]
+
+    def __init__(self, root, split):
+        assert split in ['train', 'test']
+        try:
+            self.cifar = CIFAR10(
+                root=root, train=split == 'train', download=False)
+        except Exception:
+            raise Exception("Please download CIFAR10 manually; downloading \
+                it from multiple processes in parallel may corrupt \
+                the dataset.")
+        self.labels = self.cifar.targets
+
+    def get_length(self):
+        return len(self.cifar)
+
+    def get_sample(self, idx):
+        img = Image.fromarray(self.cifar.data[idx])  # img: HWC, RGB
+        target = self.labels[idx]
+        return img, target
+
+
+@DATASOURCES.register_module
+class Cifar100(object):
+
+    CLASSES = None
+
+    def __init__(self, root, split):
+        assert split in ['train', 'test']
+        try:
+            self.cifar = CIFAR100(
+                root=root, train=split == 'train', download=False)
+        except Exception:
+            raise Exception("Please download CIFAR100 manually; downloading \
+                it from multiple processes in parallel may corrupt \
+                the dataset.")
+        self.labels = self.cifar.targets
+
+    def get_length(self):
+        # required by BaseDataset.__len__ (mirrors Cifar10)
+        return len(self.cifar)
+
+    def get_sample(self, idx):
+        img = Image.fromarray(self.cifar.data[idx])  # img: HWC, RGB
+        target = self.labels[idx]
+        return img, target
diff --git a/openselfsup/datasets/data_sources/image_list.py b/openselfsup/datasets/data_sources/image_list.py
new file mode 100644
index 00000000..1a626c19
--- /dev/null
+++ b/openselfsup/datasets/data_sources/image_list.py
@@ -0,0 +1,36 @@
+import os
+from PIL import Image
+
+from ..registry import DATASOURCES
+from .utils import McLoader
+
+
+@DATASOURCES.register_module
+class ImageList(object):
+
+    def __init__(self, root, list_file, memcached, mclient_path):
+        with open(list_file, 'r') as f:
+            lines = f.readlines()
+        self.fns = [os.path.join(root, l.strip()) for l in lines]
+        self.memcached = memcached
+        self.mclient_path = mclient_path
+        self.initialized = False
+
+    def _init_memcached(self):
+        if not self.initialized:
+            assert self.mclient_path is not None
+            self.mc_loader = McLoader(self.mclient_path)
+            self.initialized = True
+
+    def get_length(self):
+        return len(self.fns)
+
+    def get_sample(self, idx):
+        if self.memcached:
+            self._init_memcached()
+        if self.memcached:
+            img = self.mc_loader(self.fns[idx])
+        else:
+            img = Image.open(self.fns[idx])
+        img = img.convert('RGB')
+        return img
diff --git a/openselfsup/datasets/data_sources/imagenet.py b/openselfsup/datasets/data_sources/imagenet.py
new file mode 100644
index 00000000..6c58ea20
--- /dev/null
+++ b/openselfsup/datasets/data_sources/imagenet.py
@@ -0,0 +1,43 @@
+import os
+from PIL import Image
+
+from ..registry import DATASOURCES
+from .utils import McLoader
+
+
+@DATASOURCES.register_module
+class ImageNet(object):
+
+    def __init__(self, root, list_file, memcached, mclient_path):
+        with open(list_file, 'r') as f:
+            lines = f.readlines()
+        self.has_labels = len(lines[0].split()) == 2
+        if self.has_labels:
+            self.fns, self.labels = zip(*[l.strip().split() for l in lines])
+            self.labels = [int(l) for l in self.labels]
+        else:
+            self.fns = [l.strip() for l in lines]
+        self.fns = [os.path.join(root, fn) for fn in self.fns]
+        self.memcached = memcached
+        self.mclient_path = mclient_path
+        self.initialized = False
+
+    def _init_memcached(self):
+        if not self.initialized:
+            assert self.mclient_path is not None
+            self.mc_loader = McLoader(self.mclient_path)
+            self.initialized = True
+
+    def get_length(self):
+        return len(self.fns)
+
+    def get_sample(self, idx):
+        if self.memcached:
+            self._init_memcached()
+        if self.memcached:
+            img = self.mc_loader(self.fns[idx])
+        else:
+            img = Image.open(self.fns[idx])
+        img = img.convert('RGB')
+        target = self.labels[idx] if self.has_labels else None
+        return img, target
diff --git a/openselfsup/datasets/data_sources/utils.py b/openselfsup/datasets/data_sources/utils.py
new file mode 100644
index 00000000..f5f5f246
--- /dev/null
+++ b/openselfsup/datasets/data_sources/utils.py
@@ -0,0 +1,36 @@
+import io
+from PIL import Image
+try:
+    import mc
+except ImportError:
+    pass
+
+
+def pil_loader(img_str):
+    buff = io.BytesIO(img_str)
+    return Image.open(buff)
+
+
+class McLoader(object):
+
+    def __init__(self, mclient_path):
+        assert mclient_path is not None, \
+            "Please specify 'data_mclient_path' in the config."
+        self.mclient_path = mclient_path
+        server_list_config_file = "{}/server_list.conf".format(
+            self.mclient_path)
+        client_config_file = "{}/client.conf".format(self.mclient_path)
+        self.mclient = mc.MemcachedClient.GetInstance(server_list_config_file,
+                                                      client_config_file)
+
+    def __call__(self, fn):
+        try:
+            img_value = mc.pyvector()
+            self.mclient.Get(fn, img_value)
+            img_value_str = mc.ConvertBuffer(img_value)
+            img = pil_loader(img_value_str)
+        except Exception:
+            print('Read image failed ({})'.format(fn))
+            return None
+        else:
+            return img
diff --git a/openselfsup/datasets/dataset_wrappers.py b/openselfsup/datasets/dataset_wrappers.py
new file mode 100644
index 00000000..e749cb07
--- /dev/null
+++ b/openselfsup/datasets/dataset_wrappers.py
@@ -0,0 +1,55 @@
+import numpy as np
+from torch.utils.data.dataset import ConcatDataset as _ConcatDataset
+
+from .registry import DATASETS
+
+
+@DATASETS.register_module
+class ConcatDataset(_ConcatDataset):
+    """A wrapper of concatenated dataset.
+
+    Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but also
+    concatenates the group flag for image aspect ratio.
+
+    Args:
+        datasets (list[:obj:`Dataset`]): A list of datasets.
+    """
+
+    def __init__(self, datasets):
+        super(ConcatDataset, self).__init__(datasets)
+        self.CLASSES = datasets[0].CLASSES
+        if hasattr(datasets[0], 'flag'):
+            flags = []
+            for i in range(0, len(datasets)):
+                flags.append(datasets[i].flag)
+            self.flag = np.concatenate(flags)
+
+
+@DATASETS.register_module
+class RepeatDataset(object):
+    """A wrapper of repeated dataset.
+
+    The length of the repeated dataset will be `times` times that of the
+    original dataset. This is useful when the data loading time is long but
+    the dataset is small. Using RepeatDataset can reduce the data loading
+    time between epochs.
+
+    Args:
+        dataset (:obj:`Dataset`): The dataset to be repeated.
+        times (int): Repeat times.
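+
+    Example (illustrative; ``ds`` stands for any already-built dataset):
+        >>> repeated = RepeatDataset(ds, times=10)
+        >>> assert len(repeated) == 10 * len(ds)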
+ """ + + def __init__(self, dataset, times): + self.dataset = dataset + self.times = times + self.CLASSES = dataset.CLASSES + if hasattr(self.dataset, 'flag'): + self.flag = np.tile(self.dataset.flag, times) + + self._ori_len = len(self.dataset) + + def __getitem__(self, idx): + return self.dataset[idx % self._ori_len] + + def __len__(self): + return self.times * self._ori_len diff --git a/openselfsup/datasets/deepcluster.py b/openselfsup/datasets/deepcluster.py new file mode 100644 index 00000000..b4928e35 --- /dev/null +++ b/openselfsup/datasets/deepcluster.py @@ -0,0 +1,29 @@ +from .registry import DATASETS +from .base import BaseDataset + + +@DATASETS.register_module +class DeepClusterDataset(BaseDataset): + """Dataset for DC and ODC. + """ + + def __init__(self, data_source, pipeline): + super(DeepClusterDataset, self).__init__(data_source, pipeline) + # init clustering labels + self.labels = [-1 for _ in range(self.data_source.get_length())] + + def __getitem__(self, idx): + img, _ = self.data_source.get_sample(idx) + label = self.labels[idx] + img = self.pipeline(img) + return dict(img=img, pseudo_label=label, idx=idx) + + def assign_labels(self, labels): + assert len(self.labels) == len(labels), \ + "Inconsistent lenght of asigned labels, \ + {} vs {}".format(len(self.labels), len(labels)) + self.labels = labels[:] + + def evaluate(self, scores, keyword, logger=None): + + raise NotImplemented diff --git a/openselfsup/datasets/extraction.py b/openselfsup/datasets/extraction.py new file mode 100644 index 00000000..6d926df5 --- /dev/null +++ b/openselfsup/datasets/extraction.py @@ -0,0 +1,19 @@ +from .registry import DATASETS +from .base import BaseDataset + + +@DATASETS.register_module +class ExtractDataset(BaseDataset): + """Dataset for feature extraction + """ + + def __init__(self, data_source, pipeline): + super(ExtractDataset, self).__init__(data_source, pipeline) + + def __getitem__(self, idx): + img = self.data_source.get_sample(idx) + img = self.pipeline(img) + return dict(img=img) + + def evaluate(self, scores, keyword, logger=None): + raise NotImplemented diff --git a/openselfsup/datasets/loader/__init__.py b/openselfsup/datasets/loader/__init__.py new file mode 100644 index 00000000..7d11a011 --- /dev/null +++ b/openselfsup/datasets/loader/__init__.py @@ -0,0 +1,7 @@ +from .build_loader import build_dataloader +from .sampler import DistributedGroupSampler, GroupSampler, DistributedGivenIterationSampler + +__all__ = [ + 'GroupSampler', 'DistributedGroupSampler', 'build_dataloader', + 'DistributedGivenIterationSampler' +] diff --git a/openselfsup/datasets/loader/build_loader.py b/openselfsup/datasets/loader/build_loader.py new file mode 100644 index 00000000..fc6c8118 --- /dev/null +++ b/openselfsup/datasets/loader/build_loader.py @@ -0,0 +1,81 @@ +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from torch.utils.data import DataLoader + +#from .sampler import DistributedGroupSampler, DistributedSampler, GroupSampler +from .sampler import DistributedSampler, DistributedGivenIterationSampler +from torch.utils.data import RandomSampler + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) + + +def build_dataloader(dataset, + imgs_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + 
replace=False, + seed=None, + **kwargs): + """Build PyTorch DataLoader. + + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + + Args: + dataset (Dataset): A PyTorch dataset. + imgs_per_gpu (int): Number of images on each GPU, i.e., batch size of + each GPU. + workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. + replace (bool): Replace or not in random shuffle. + It works on when shuffle is True. + kwargs: any keyword argument to be used to initialize DataLoader + + Returns: + DataLoader: A PyTorch dataloader. + """ + if dist: + rank, world_size = get_dist_info() + sampler = DistributedSampler( + dataset, world_size, rank, shuffle=shuffle, replace=replace) + batch_size = imgs_per_gpu + num_workers = workers_per_gpu + else: + if replace: + raise NotImplemented + sampler = RandomSampler( + dataset) if shuffle else None # TODO: set replace + batch_size = num_gpus * imgs_per_gpu + num_workers = num_gpus * workers_per_gpu + + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu), + pin_memory=False, + worker_init_fn=worker_init_fn if seed is not None else None, + **kwargs) + + return data_loader + + +def worker_init_fn(seed): + np.random.seed(seed) + random.seed(seed) diff --git a/openselfsup/datasets/loader/sampler.py b/openselfsup/datasets/loader/sampler.py new file mode 100644 index 00000000..2653e2f8 --- /dev/null +++ b/openselfsup/datasets/loader/sampler.py @@ -0,0 +1,299 @@ +from __future__ import division +import math + +import numpy as np +import torch +from mmcv.runner import get_dist_info +from torch.utils.data import DistributedSampler as _DistributedSampler +from torch.utils.data import Sampler + + +class DistributedSampler(_DistributedSampler): + + def __init__(self, + dataset, + num_replicas=None, + rank=None, + shuffle=True, + replace=False): + super().__init__(dataset, num_replicas=num_replicas, rank=rank) + self.shuffle = shuffle + self.replace = replace + self.unif_sampling_flag = False + + def __iter__(self): + # deterministically shuffle based on epoch + if not self.unif_sampling_flag: + self.generate_new_list() + else: + self.unif_sampling_flag = False + return iter(self.indices[self.rank * self.num_samples:(self.rank + 1) * + self.num_samples]) + + def generate_new_list(self): + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.epoch) + if self.replace: + indices = torch.randint( + low=0, + high=len(self.dataset), + size=(len(self.dataset), ), + generator=g).tolist() + else: + indices = torch.randperm( + len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + self.indices = indices + + def set_uniform_indices(self, labels, num_classes): + self.unif_sampling_flag = True + assert self.shuffle, "Using uniform sampling, the indices must be shuffled." 
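+        # seed with the current epoch so that all replicas generate the same
+        # class-uniform index list; each rank then reads its own slice in
+        # __iter__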
+        np.random.seed(self.epoch)
+        assert len(labels) == len(self.dataset)
+        N = len(labels)
+        size_per_label = int(N / num_classes) + 1
+        indices = []
+        images_lists = [[] for i in range(num_classes)]
+        for i, l in enumerate(labels):
+            images_lists[l].append(i)
+        for i, l in enumerate(images_lists):
+            if len(l) == 0:
+                continue
+            indices.extend(
+                np.random.choice(
+                    l, size_per_label, replace=(len(l) <= size_per_label)))
+        indices = np.array(indices)
+        np.random.shuffle(indices)
+        indices = indices[:N].astype(np.int64).tolist()
+
+        # add extra samples to make it evenly divisible
+        assert len(indices) <= self.total_size, \
+            "{} vs {}".format(len(indices), self.total_size)
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size, \
+            "{} vs {}".format(len(indices), self.total_size)
+        self.indices = indices
+
+
+class GroupSampler(Sampler):
+
+    def __init__(self, dataset, samples_per_gpu=1):
+        assert hasattr(dataset, 'flag')
+        self.dataset = dataset
+        self.samples_per_gpu = samples_per_gpu
+        self.flag = dataset.flag.astype(np.int64)
+        self.group_sizes = np.bincount(self.flag)
+        self.num_samples = 0
+        for i, size in enumerate(self.group_sizes):
+            self.num_samples += int(np.ceil(
+                size / self.samples_per_gpu)) * self.samples_per_gpu
+
+    def __iter__(self):
+        indices = []
+        for i, size in enumerate(self.group_sizes):
+            if size == 0:
+                continue
+            indice = np.where(self.flag == i)[0]
+            assert len(indice) == size
+            np.random.shuffle(indice)
+            num_extra = int(np.ceil(size / self.samples_per_gpu)
+                            ) * self.samples_per_gpu - len(indice)
+            indice = np.concatenate(
+                [indice, np.random.choice(indice, num_extra)])
+            indices.append(indice)
+        indices = np.concatenate(indices)
+        indices = [
+            indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu]
+            for i in np.random.permutation(
+                range(len(indices) // self.samples_per_gpu))
+        ]
+        indices = np.concatenate(indices)
+        indices = indices.astype(np.int64).tolist()
+        assert len(indices) == self.num_samples
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+
+class DistributedGroupSampler(Sampler):
+    """Sampler that restricts data loading to a subset of the dataset.
+    It is especially useful in conjunction with
+    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
+    process can pass a DistributedSampler instance as a DataLoader sampler,
+    and load a subset of the original dataset that is exclusive to it.
+    .. note::
+        Dataset is assumed to be of constant size.
+    Arguments:
+        dataset: Dataset used for sampling.
+        num_replicas (optional): Number of processes participating in
+            distributed training.
+        rank (optional): Rank of the current process within num_replicas.
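+        samples_per_gpu (optional): Number of consecutive indices assigned
+            to one GPU batch; indices are padded so that each such batch
+            contains samples from a single group only.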
+ """ + + def __init__(self, + dataset, + samples_per_gpu=1, + num_replicas=None, + rank=None): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + + assert hasattr(self.dataset, 'flag') + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += int( + math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / + self.num_replicas)) * self.samples_per_gpu + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + indice = indice[list(torch.randperm(int(size), + generator=g))].tolist() + extra = int( + math.ceil( + size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + # pad indice + tmp = indice.copy() + for _ in range(extra // size): + indice.extend(tmp) + indice.extend(tmp[:extra % size]) + indices.extend(indice) + + assert len(indices) == self.total_size + + indices = [ + indices[j] for i in list( + torch.randperm( + len(indices) // self.samples_per_gpu, generator=g)) + for j in range(i * self.samples_per_gpu, (i + 1) * + self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + + +class DistributedGivenIterationSampler(Sampler): + + def __init__(self, + dataset, + total_iter, + batch_size, + num_replicas=None, + rank=None, + last_iter=-1): + rank, world_size = get_dist_info() + assert rank < world_size + self.dataset = dataset + self.total_iter = total_iter + self.batch_size = batch_size + self.world_size = world_size + self.rank = rank + self.last_iter = last_iter + + self.total_size = self.total_iter * self.batch_size + + self.indices = self.gen_new_list() + + def __iter__(self): + return iter(self.indices[(self.last_iter + 1) * self.batch_size:]) + + def set_uniform_indices(self, labels, num_classes): + np.random.seed(0) + assert (len(labels) == len(self.dataset)) + N = len(labels) + size_per_label = int(N / num_classes) + 1 + indices = [] + images_lists = [[] for i in range(num_classes)] + for i, l in enumerate(labels): + images_lists[l].append(i) + for i, l in enumerate(images_lists): + if len(l) == 0: + continue + indices.extend( + np.random.choice( + l, size_per_label, replace=(len(l) <= size_per_label))) + indices = np.array(indices) + np.random.shuffle(indices) + indices = indices[:N].astype(np.int) + # repeat + all_size = self.total_size * self.world_size + indices = indices[:all_size] + num_repeat = (all_size - 1) // indices.shape[0] + 1 + indices = np.tile(indices, num_repeat) + indices = indices[:all_size] + np.random.shuffle(indices) + # slice + beg = self.total_size * self.rank + indices = indices[beg:beg + self.total_size] + assert len(indices) == self.total_size + # set + self.indices = indices + + def gen_new_list(self): + + # each process shuffle all list with same seed, and 
pick one piece according to rank + np.random.seed(0) + + all_size = self.total_size * self.world_size + indices = np.arange(len(self.dataset)) + indices = indices[:all_size] + num_repeat = (all_size - 1) // indices.shape[0] + 1 + indices = np.tile(indices, num_repeat) + indices = indices[:all_size] + + np.random.shuffle(indices) + beg = self.total_size * self.rank + indices = indices[beg:beg + self.total_size] + + assert len(indices) == self.total_size + + return indices + + def __len__(self): + # note here we do not take last iter into consideration, since __len__ + # should only be used for displaying, the correct remaining size is + # handled by dataloader + #return self.total_size - (self.last_iter+1)*self.batch_size + return self.total_size + + def set_epoch(self, epoch): + pass diff --git a/openselfsup/datasets/npid.py b/openselfsup/datasets/npid.py new file mode 100644 index 00000000..4e0205eb --- /dev/null +++ b/openselfsup/datasets/npid.py @@ -0,0 +1,20 @@ +from .registry import DATASETS +from .base import BaseDataset + + +@DATASETS.register_module +class NPIDDataset(BaseDataset): + """Dataset for NPID. + """ + + def __init__(self, data_source, pipeline): + super(NPIDDataset, self).__init__(data_source, pipeline) + + def __getitem__(self, idx): + img, _ = self.data_source.get_sample(idx) + img = self.pipeline(img) + return dict(img=img, idx=idx) + + def evaluate(self, scores, keyword, logger=None): + + raise NotImplemented diff --git a/openselfsup/datasets/pipelines/__init__.py b/openselfsup/datasets/pipelines/__init__.py new file mode 100644 index 00000000..7986cdd6 --- /dev/null +++ b/openselfsup/datasets/pipelines/__init__.py @@ -0,0 +1 @@ +from .transforms import * diff --git a/openselfsup/datasets/pipelines/transforms.py b/openselfsup/datasets/pipelines/transforms.py new file mode 100644 index 00000000..c5715688 --- /dev/null +++ b/openselfsup/datasets/pipelines/transforms.py @@ -0,0 +1,92 @@ +import cv2 +import inspect +import numpy as np +from PIL import Image + +import torch +from torchvision import transforms as _transforms + +from openselfsup.utils import build_from_cfg + +from ..registry import PIPELINES + +# register all existing transforms in torchvision +for m in inspect.getmembers(_transforms, inspect.isclass): + PIPELINES.register_module(m[1]) + + +@PIPELINES.register_module +class RandomAppliedTrans(object): + '''Randomly applied transformations. + Args: + transforms (List[Dict]): List of transformations in dictionaries. 
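+        p (float): Probability that the whole transformation list is
+            applied. Default: 0.5.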
+    '''
+
+    def __init__(self, transforms, p=0.5):
+        t = [build_from_cfg(t, PIPELINES) for t in transforms]
+        self.trans = _transforms.RandomApply(t, p=p)
+
+    def __call__(self, img):
+        return self.trans(img)
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        return repr_str
+
+
+# custom transforms
+@PIPELINES.register_module
+class Lighting(object):
+    """Lighting noise (AlexNet-style, PCA-based noise)."""
+    _IMAGENET_PCA = {
+        'eigval':
+        torch.Tensor([0.2175, 0.0188, 0.0045]),
+        'eigvec':
+        torch.Tensor([
+            [-0.5675, 0.7192, 0.4009],
+            [-0.5808, -0.0045, -0.8140],
+            [-0.5836, -0.6948, 0.4203],
+        ])
+    }
+
+    def __init__(self):
+        self.alphastd = 0.1
+        self.eigval = self._IMAGENET_PCA['eigval']
+        self.eigvec = self._IMAGENET_PCA['eigvec']
+
+    def __call__(self, img):
+        assert isinstance(img, torch.Tensor), \
+            "Expect torch.Tensor, got {}".format(type(img))
+        if self.alphastd == 0:
+            return img
+
+        alpha = img.new().resize_(3).normal_(0, self.alphastd)
+        rgb = self.eigvec.type_as(img).clone()\
+            .mul(alpha.view(1, 3).expand(3, 3))\
+            .mul(self.eigval.view(1, 3).expand(3, 3))\
+            .sum(1).squeeze()
+
+        return img.add(rgb.view(3, 1, 1).expand_as(img))
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        return repr_str
+
+
+@PIPELINES.register_module
+class GaussianBlur(object):
+    """Blur the image with a sigma drawn uniformly from
+    [sigma_min, sigma_max]."""
+
+    def __init__(self, sigma_min, sigma_max, kernel_size):
+        self.sigma_min = sigma_min
+        self.sigma_max = sigma_max
+        self.kernel_size = kernel_size
+
+    def __call__(self, img):
+        sigma = np.random.uniform(self.sigma_min, self.sigma_max)
+        img = cv2.GaussianBlur(
+            np.array(img), (self.kernel_size, self.kernel_size), sigma)
+        return Image.fromarray(img.astype(np.uint8))
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        return repr_str
diff --git a/openselfsup/datasets/registry.py b/openselfsup/datasets/registry.py
new file mode 100644
index 00000000..48642783
--- /dev/null
+++ b/openselfsup/datasets/registry.py
@@ -0,0 +1,5 @@
+from openselfsup.utils import Registry
+
+DATASOURCES = Registry('datasource')
+DATASETS = Registry('dataset')
+PIPELINES = Registry('pipeline')
diff --git a/openselfsup/datasets/rotation_pred.py b/openselfsup/datasets/rotation_pred.py
new file mode 100644
index 00000000..0f90a34e
--- /dev/null
+++ b/openselfsup/datasets/rotation_pred.py
@@ -0,0 +1,35 @@
+import torch
+
+from .registry import DATASETS
+from .base import BaseDataset
+
+
+def rotate(img):
+    '''Return the four rotated copies (0, 90, 180, 270 degrees) of img.
+
+    Args:
+        img: Tensor (CHW).
+    '''
+    return [
+        img,
+        torch.flip(img.transpose(1, 2), [1]),
+        torch.flip(img, [1, 2]),
+        torch.flip(img, [1]).transpose(1, 2)
+    ]
+
+
+@DATASETS.register_module
+class RotationPredDataset(BaseDataset):
+    """Dataset for rotation prediction.
+    """
+
+    def __init__(self, data_source, pipeline):
+        super(RotationPredDataset, self).__init__(data_source, pipeline)
+
+    def __getitem__(self, idx):
+        img, _ = self.data_source.get_sample(idx)
+        img = self.pipeline(img)
+        img = torch.stack(rotate(img), dim=0)
+        rotation_labels = torch.LongTensor([0, 1, 2, 3])
+        return dict(img=img, rot_label=rotation_labels)
+
+    def evaluate(self, scores, keyword, logger=None):
+        raise NotImplementedError
diff --git a/openselfsup/hooks/__init__.py b/openselfsup/hooks/__init__.py
new file mode 100644
index 00000000..cdcd6cde
--- /dev/null
+++ b/openselfsup/hooks/__init__.py
@@ -0,0 +1,7 @@
+from .builder import build_hook
+from .deepcluster_hook import DeepClusterHook
+from .odc_hook import ODCHook
+from .optimizer_hook import DistOptimizerHook
+from .extractor import Extractor
+from .validate_hook import ValidateHook
+from .registry import HOOKS diff --git a/openselfsup/hooks/builder.py b/openselfsup/hooks/builder.py new file mode 100644 index 00000000..b56591bf --- /dev/null +++ b/openselfsup/hooks/builder.py @@ -0,0 +1,7 @@ +from openselfsup.utils import build_from_cfg + +from .registry import HOOKS + + +def build_hook(cfg, default_args=None): + return build_from_cfg(cfg, HOOKS, default_args) diff --git a/openselfsup/hooks/deepcluster_hook.py b/openselfsup/hooks/deepcluster_hook.py new file mode 100644 index 00000000..7ed56a0e --- /dev/null +++ b/openselfsup/hooks/deepcluster_hook.py @@ -0,0 +1,109 @@ +import numpy as np + +from mmcv.runner import Hook + +import torch +import torch.distributed as dist + +from openselfsup.third_party import clustering as _clustering +from openselfsup.utils import print_log +from .registry import HOOKS +from .extractor import Extractor + + +@HOOKS.register_module +class DeepClusterHook(Hook): + + def __init__( + self, + extractor, + clustering, + unif_sampling, + reweight, + reweight_pow, + init_memory=False, # for ODC + initial=True, + interval=1, + dist_mode=True, + data_loaders=None): + self.extractor = Extractor(dist_mode=dist_mode, **extractor) + self.clustering_type = clustering.pop('type') + self.clustering_cfg = clustering + self.unif_sampling = unif_sampling + self.reweight = reweight + self.reweight_pow = reweight_pow + self.init_memory = init_memory + self.initial = initial + self.interval = interval + self.dist_mode = dist_mode + self.data_loaders = data_loaders + + def before_run(self, runner): + if self.initial: + self.deepcluster(runner) + + def after_train_epoch(self, runner): + if not self.every_n_epochs(runner, self.interval): + return + self.deepcluster(runner) + + def deepcluster(self, runner): + # step 1: get features + runner.model.eval() + features = self.extractor(runner) + runner.model.train() + + # step 2: get labels + if not self.dist_mode or (self.dist_mode and runner.rank == 0): + clustering_algo = _clustering.__dict__[self.clustering_type]( + **self.clustering_cfg) + # Features are normalized during clustering + clustering_algo.cluster(features, verbose=True) + assert isinstance(clustering_algo.labels, np.ndarray) + new_labels = clustering_algo.labels.astype(np.int64) + np.save( + "{}/cluster_epoch_{}.npy".format(runner.work_dir, + runner.epoch), new_labels) + self.evaluate(runner, new_labels) + else: + new_labels = np.zeros((len(self.data_loaders[0].dataset), ), + dtype=np.int64) + + if self.dist_mode: + new_labels_tensor = torch.from_numpy(new_labels).cuda() + dist.broadcast(new_labels_tensor, 0) + new_labels = new_labels_tensor.cpu().numpy() + new_labels_list = list(new_labels) + + # step 3: assign new labels + self.data_loaders[0].dataset.assign_labels(new_labels_list) + + # step 4 (a): set uniform sampler + if self.unif_sampling: + self.data_loaders[0].sampler.set_uniform_indices( + new_labels_list, self.clustering_cfg.k) + + # step 4 (b): set loss reweight + if self.reweight: + runner.model.module.set_reweight(new_labels, self.reweight_pow) + + # step 5: randomize classifier + runner.model.module.head.init_weights(init_linear='normal') + if self.dist_mode: + for p in runner.model.module.head.state_dict().values(): + dist.broadcast(p, 0) + + # step 6: init memory for ODC + if self.init_memory: + runner.model.module.memory_bank.init_memory(features, new_labels) + + def evaluate(self, runner, new_labels): + hist = np.bincount(new_labels, minlength=self.clustering_cfg.k) + empty_cls = (hist == 0).sum() + minimal_cls_size, 
maximal_cls_size = hist.min(), hist.max()
+        if runner.rank == 0:
+            print_log(
+                "empty_num: {}\tmin_cluster: {}\tmax_cluster:{}".format(
+                    empty_cls.item(), minimal_cls_size.item(),
+                    maximal_cls_size.item()),
+                logger='root')
diff --git a/openselfsup/hooks/extractor.py b/openselfsup/hooks/extractor.py
new file mode 100644
index 00000000..6c001da5
--- /dev/null
+++ b/openselfsup/hooks/extractor.py
@@ -0,0 +1,50 @@
+import torch.nn as nn
+from torch.utils.data import Dataset
+
+from openselfsup.utils import nondist_forward_collect, dist_forward_collect
+
+
+class Extractor(object):
+
+    def __init__(self,
+                 dataset,
+                 imgs_per_gpu,
+                 workers_per_gpu,
+                 dist_mode=False):
+        from openselfsup import datasets
+        if isinstance(dataset, Dataset):
+            self.dataset = dataset
+        elif isinstance(dataset, dict):
+            self.dataset = datasets.build_dataset(dataset)
+        else:
+            raise TypeError(
+                'dataset must be a Dataset object or a dict, not {}'.format(
+                    type(dataset)))
+        self.data_loader = datasets.build_dataloader(
+            self.dataset,
+            imgs_per_gpu,
+            workers_per_gpu,
+            dist=dist_mode,
+            shuffle=False)
+        self.dist_mode = dist_mode
+        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+
+    def _forward_func(self, runner, **x):
+        backbone_feat = runner.model(mode='extract', **x)
+        last_layer_feat = runner.model.module.neck([backbone_feat[-1]])[0]
+        last_layer_feat = last_layer_feat.view(last_layer_feat.size(0), -1)
+        return dict(feature=last_layer_feat.cpu())
+
+    def __call__(self, runner):
+        func = lambda **x: self._forward_func(runner, **x)
+        if self.dist_mode:
+            feats = dist_forward_collect(
+                func,
+                self.data_loader,
+                runner.rank,
+                len(self.dataset),
+                ret_rank=-1)['feature']  # NxD
+        else:
+            feats = nondist_forward_collect(func, self.data_loader,
+                                            len(self.dataset))['feature']
+        return feats
diff --git a/openselfsup/hooks/odc_hook.py b/openselfsup/hooks/odc_hook.py
new file mode 100644
index 00000000..b059bb96
--- /dev/null
+++ b/openselfsup/hooks/odc_hook.py
@@ -0,0 +1,67 @@
+import numpy as np
+
+from mmcv.runner import Hook
+
+from openselfsup.utils import print_log
+from .registry import HOOKS
+
+
+@HOOKS.register_module
+class ODCHook(Hook):
+
+    def __init__(self,
+                 centroids_update_interval,
+                 deal_with_small_clusters_interval,
+                 evaluate_interval,
+                 reweight,
+                 reweight_pow,
+                 dist_mode=True):
+        assert dist_mode, "non-dist mode is not implemented"
+        self.centroids_update_interval = centroids_update_interval
+        self.deal_with_small_clusters_interval = \
+            deal_with_small_clusters_interval
+        self.evaluate_interval = evaluate_interval
+        self.reweight = reweight
+        self.reweight_pow = reweight_pow
+
+    def after_train_iter(self, runner):
+        # centroids update
+        if self.every_n_iters(runner, self.centroids_update_interval):
+            runner.model.module.memory_bank.update_centroids_memory()
+
+        # deal with small clusters
+        if self.every_n_iters(runner, self.deal_with_small_clusters_interval):
+            runner.model.module.memory_bank.deal_with_small_clusters()
+
+        # reweight
+        runner.model.module.set_reweight()
+
+        # evaluate
+        if self.every_n_iters(runner, self.evaluate_interval):
+            new_labels = runner.model.module.memory_bank.label_bank
+            if new_labels.is_cuda:
+                new_labels = new_labels.cpu()
+            self.evaluate(runner, new_labels.numpy())
+
+    def after_train_epoch(self, runner):
+        # save cluster assignments every 10 epochs
+        if self.every_n_epochs(runner, 10) and runner.rank == 0:
+            new_labels = runner.model.module.memory_bank.label_bank
+            if new_labels.is_cuda:
+                new_labels = new_labels.cpu()
+            np.save(
+                "{}/cluster_epoch_{}.npy".format(runner.work_dir,
runner.epoch), + new_labels.numpy()) + + def evaluate(self, runner, new_labels): + hist = np.bincount( + new_labels, minlength=runner.model.module.memory_bank.num_classes) + empty_cls = (hist == 0).sum() + minimal_cls_size, maximal_cls_size = hist.min(), hist.max() + if runner.rank == 0: + print_log( + "empty_num: {}\tmin_cluster: {}\tmax_cluster:{}".format( + empty_cls.item(), minimal_cls_size.item(), + maximal_cls_size.item()), + logger='root') diff --git a/openselfsup/hooks/optimizer_hook.py b/openselfsup/hooks/optimizer_hook.py new file mode 100644 index 00000000..e8c1b7c9 --- /dev/null +++ b/openselfsup/hooks/optimizer_hook.py @@ -0,0 +1,16 @@ +from mmcv.runner import OptimizerHook + + +class DistOptimizerHook(OptimizerHook): + + def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + + def after_train_iter(self, runner): + runner.optimizer.zero_grad() + runner.outputs['loss'].backward() + if self.grad_clip is not None: + self.clip_grads(runner.model.parameters()) + runner.optimizer.step() diff --git a/openselfsup/hooks/registry.py b/openselfsup/hooks/registry.py new file mode 100644 index 00000000..1f196dc8 --- /dev/null +++ b/openselfsup/hooks/registry.py @@ -0,0 +1,3 @@ +from openselfsup.utils import Registry + +HOOKS = Registry('hook') diff --git a/openselfsup/hooks/validate_hook.py b/openselfsup/hooks/validate_hook.py new file mode 100644 index 00000000..45efd9e2 --- /dev/null +++ b/openselfsup/hooks/validate_hook.py @@ -0,0 +1,71 @@ +from mmcv.runner import Hook + +import torch +from torch.utils.data import Dataset + +from openselfsup.utils import nondist_forward_collect, dist_forward_collect +from .registry import HOOKS + + +@HOOKS.register_module +class ValidateHook(Hook): + + def __init__(self, + dataset, + dist_mode=True, + initial=True, + interval=1, + **eval_kwargs): + from openselfsup import datasets + if isinstance(dataset, Dataset): + self.dataset = dataset + elif isinstance(dataset, dict): + self.dataset = datasets.build_dataset(dataset) + else: + raise TypeError( + 'dataset must be a Dataset object or a dict, not {}'.format( + type(dataset))) + self.data_loader = datasets.build_dataloader( + self.dataset, + eval_kwargs['imgs_per_gpu'], + eval_kwargs['workers_per_gpu'], + dist=dist_mode, + shuffle=False) + self.dist_mode = dist_mode + self.initial = initial + self.interval = interval + self.eval_kwargs = eval_kwargs + + def before_run(self, runner): + if self.initial: + self._run_validate(runner) + + def after_train_epoch(self, runner): + if not self.every_n_epochs(runner, self.interval): + return + self._run_validate(runner) + + def _run_validate(self, runner): + runner.model.eval() + func = lambda **x: runner.model(mode='test', **x) + if self.dist_mode: + results = dist_forward_collect( + func, self.data_loader, runner.rank, + len(self.dataset)) # dict{key: np.ndarray} + else: + results = nondist_forward_collect(func, self.data_loader, + len(self.dataset)) + if runner.rank == 0: + for name, val in results.items(): + self._evaluate(runner, torch.from_numpy(val), name) + runner.model.train() + + def _evaluate(self, runner, results, keyword): + eval_res = self.dataset.evaluate( + results, + keyword=keyword, + logger=runner.logger, + **self.eval_kwargs['eval_param']) + for name, val in eval_res.items(): + runner.log_buffer.output[name] = val + runner.log_buffer.ready = True diff --git a/openselfsup/models/__init__.py b/openselfsup/models/__init__.py new file 
mode 100644 index 00000000..80813ea3 --- /dev/null +++ b/openselfsup/models/__init__.py @@ -0,0 +1,20 @@ +from .backbones import * # noqa: F401,F403 +from .builder import (build_backbone, build_model, build_head, build_loss) +from .heads import * +from .classification import Classification +from .deepcluster import DeepCluster +from .odc import ODC +from .losses import * # noqa: F401,F403 +from .necks import * +from .npid import NPID +from .memories import * +from .moco import MOCO +from .registry import (BACKBONES, MODELS, NECKS, MEMORIES, HEADS, LOSSES) +from .rotation_pred import RotationPred +from .simclr import SimCLR + +#__all__ = [ +# 'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'LOSSES', +# 'DETECTORS', 'CLASSIFIERS', 'build_backbone', 'build_neck', 'build_roi_extractor', +# 'build_shared_head', 'build_head', 'build_loss', 'build_detector', 'build_detector' +#] diff --git a/openselfsup/models/backbones/__init__.py b/openselfsup/models/backbones/__init__.py new file mode 100644 index 00000000..d718d076 --- /dev/null +++ b/openselfsup/models/backbones/__init__.py @@ -0,0 +1,6 @@ +#from .hrnet import HRNet +from .resnet import ResNet, make_res_layer +#from .resnext import ResNeXt +#from .ssd_vgg import SSDVGG + +#__all__ = ['ResNet', 'make_res_layer', 'ResNeXt', 'SSDVGG', 'HRNet'] diff --git a/openselfsup/models/backbones/resnet.py b/openselfsup/models/backbones/resnet.py new file mode 100644 index 00000000..db2c5fe0 --- /dev/null +++ b/openselfsup/models/backbones/resnet.py @@ -0,0 +1,429 @@ +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import constant_init, kaiming_init +from mmcv.runner import load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from openselfsup.utils import get_root_logger +from ..registry import BACKBONES +from ..utils import build_conv_layer, build_norm_layer + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN')): + super(BasicBlock, self).__init__() + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + assert not with_cp + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + @property + def norm2(self): + return getattr(self, self.norm2_name) + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN')): + """Bottleneck block for ResNet. 
+ If style is "pytorch", the stride-two layer is the 3x3 conv layer, + if it is "caffe", the stride-two layer is the first 1x1 conv layer. + """ + super(Bottleneck, self).__init__() + assert style in ['pytorch', 'caffe'] + + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + planes, + planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + @property + def norm2(self): + return getattr(self, self.norm2_name) + + @property + def norm3(self): + return getattr(self, self.norm3_name) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def make_res_layer(block, + inplanes, + planes, + blocks, + stride=1, + dilation=1, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN')): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1], + ) + + layers = [] + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + dilation=dilation, + downsample=downsample, + style=style, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + dilation=dilation, + style=style, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + + return nn.Sequential(*layers) + + +@BACKBONES.register_module +class ResNet(nn.Module): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Normally 3. + num_stages (int): Resnet stages, normally 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. 
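+            Note: index 0 is the stem output (after conv1/norm1/relu,
+            before maxpool); indices 1-4 are the four res stages.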
+ style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from openselfsup.models import ResNet + >>> import torch + >>> self = ResNet(depth=18) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3, 4), + style='pytorch', + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=False): + super(ResNet, self).__init__() + if depth not in self.arch_settings: + raise KeyError('invalid depth {} for resnet'.format(depth)) + self.depth = depth + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + 1 + self.style = style + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.zero_init_residual = zero_init_residual + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = 64 + + self._make_stem_layer(in_channels) + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + planes = 64 * 2**i + res_layer = make_res_layer( + self.block, + self.inplanes, + planes, + num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + self.inplanes = planes * self.block.expansion + layer_name = 'layer{}'.format(i + 1) + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = self.block.expansion * 64 * 2**( + len(self.stage_blocks) - 1) + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels): + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + 64, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + if self.frozen_stages >= 0: 
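+            # frozen_stages >= 0 also freezes the stem (conv1 and norm1),
+            # in addition to the first `frozen_stages` res layers below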
+ self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, 'layer{}'.format(i)) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m, mode='fan_in', nonlinearity='relu') + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + outs = [] + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) # r50: 64x128x128 + if 0 in self.out_indices: + outs.append(x) + x = self.maxpool(x) # r50: 64x56x56 + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i + 1 in self.out_indices: + outs.append(x) + # r50: 1-256x56x56; 2-512x28x28; 3-1024x14x14; 4-2048x7x7 + return tuple(outs) + + def train(self, mode=True): + super(ResNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/openselfsup/models/backbones/resnext.py b/openselfsup/models/backbones/resnext.py new file mode 100644 index 00000000..326e8827 --- /dev/null +++ b/openselfsup/models/backbones/resnext.py @@ -0,0 +1,222 @@ +import math + +import torch.nn as nn + +from ..registry import BACKBONES +from ..utils import build_conv_layer, build_norm_layer +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottleneck(_Bottleneck): + + def __init__(self, inplanes, planes, groups=1, base_width=4, **kwargs): + """Bottleneck block for ResNeXt. + If style is "pytorch", the stride-two layer is the 3x3 conv layer, + if it is "caffe", the stride-two layer is the first 1x1 conv layer. 
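+
+        When groups > 1, the conv1/conv2 width becomes
+        ``floor(planes * base_width / 64) * groups`` instead of ``planes``.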
+ """ + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * (base_width / 64)) * groups + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, width, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +def make_res_layer(block, + inplanes, + planes, + blocks, + stride=1, + dilation=1, + groups=1, + base_width=4, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + gcb=None): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1], + ) + + layers = [] + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + dilation=dilation, + downsample=downsample, + groups=groups, + base_width=base_width, + style=style, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + gcb=gcb)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + dilation=dilation, + groups=groups, + base_width=base_width, + style=style, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + gcb=gcb)) + + return nn.Sequential(*layers) + + +@BACKBONES.register_module +class ResNeXt(ResNet): + """ResNeXt backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Normally 3. + num_stages (int): Resnet stages, normally 4. + groups (int): Group of resnext. + base_width (int): Base width of resnext. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_cfg (dict): dictionary to construct and config norm layer. 
+ norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from openselfsup.models import ResNeXt + >>> import torch + >>> self = ResNeXt(depth=50) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 256, 8, 8) + (1, 512, 4, 4) + (1, 1024, 2, 2) + (1, 2048, 1, 1) + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, groups=1, base_width=4, **kwargs): + super(ResNeXt, self).__init__(**kwargs) + self.groups = groups + self.base_width = base_width + + self.inplanes = 64 + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = self.strides[i] + dilation = self.dilations[i] + dcn = self.dcn if self.stage_with_dcn[i] else None + gcb = self.gcb if self.stage_with_gcb[i] else None + planes = 64 * 2**i + res_layer = make_res_layer( + self.block, + self.inplanes, + planes, + num_blocks, + stride=stride, + dilation=dilation, + groups=self.groups, + base_width=self.base_width, + style=self.style, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=dcn, + gcb=gcb) + self.inplanes = planes * self.block.expansion + layer_name = 'layer{}'.format(i + 1) + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() diff --git a/openselfsup/models/builder.py b/openselfsup/models/builder.py new file mode 100644 index 00000000..4d2524f5 --- /dev/null +++ b/openselfsup/models/builder.py @@ -0,0 +1,38 @@ +from torch import nn + +from openselfsup.utils import build_from_cfg +from .registry import (BACKBONES, MODELS, NECKS, HEADS, MEMORIES, LOSSES) + + +def build(cfg, registry, default_args=None): + if isinstance(cfg, list): + modules = [ + build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg + ] + return nn.Sequential(*modules) + else: + return build_from_cfg(cfg, registry, default_args) + + +def build_backbone(cfg): + return build(cfg, BACKBONES) + + +def build_neck(cfg): + return build(cfg, NECKS) + + +def build_memory(cfg): + return build(cfg, MEMORIES) + + +def build_head(cfg): + return build(cfg, HEADS) + + +def build_loss(cfg): + return build(cfg, LOSSES) + + +def build_model(cfg): + return build(cfg, MODELS) diff --git a/openselfsup/models/classification.py b/openselfsup/models/classification.py new file mode 100644 index 00000000..81a1a25f --- /dev/null +++ b/openselfsup/models/classification.py @@ -0,0 +1,79 @@ +import numpy as np + +import torch.nn as nn + +from openselfsup.utils import print_log + +from . 
import builder
+from .registry import MODELS
+from .utils import Sobel
+
+
+@MODELS.register_module
+class Classification(nn.Module):
+
+    def __init__(self,
+                 backbone,
+                 frozen_backbone=False,
+                 with_sobel=False,
+                 head=None,
+                 pretrained=None):
+        super(Classification, self).__init__()
+        self.with_sobel = with_sobel
+        if with_sobel:
+            self.sobel_layer = Sobel()
+        self.backbone = builder.build_backbone(backbone)
+        if frozen_backbone:
+            self.backbone.eval()
+            for param in self.backbone.parameters():
+                param.requires_grad = False
+        if head is not None:
+            self.head = builder.build_head(head)
+        self.init_weights(pretrained=pretrained)
+
+    def init_weights(self, pretrained=None):
+        if pretrained is not None:
+            print_log('load model from: {}'.format(pretrained), logger='root')
+        self.backbone.init_weights(pretrained=pretrained)
+        self.head.init_weights()
+
+    def forward_backbone(self, img):
+        """Forward backbone
+
+        Returns:
+            x (tuple): backbone outputs
+        """
+        if self.with_sobel:
+            img = self.sobel_layer(img)
+        x = self.backbone(img)
+        return x
+
+    def forward_train(self, img, gt_label, **kwargs):
+        x = self.forward_backbone(img)
+        outs = self.head(x)
+        loss_inputs = (outs, gt_label)
+        losses = self.head.loss(*loss_inputs)
+        return losses
+
+    def forward_test(self, img, **kwargs):
+        x = self.forward_backbone(img)  # tuple
+        outs = self.head(x)
+        keys = ['head{}'.format(i) for i in range(len(outs))]
+        out_tensors = [out.cpu() for out in outs]  # NxC
+        return dict(zip(keys, out_tensors))
+
+    def aug_test(self, imgs):
+        # Not implemented yet. The intended behavior (kept from the original
+        # draft, previously unreachable dead code) is to average head outputs
+        # over augmented views:
+        #   outs = np.mean([self.head(x) for x in self.forward_backbone(imgs)],
+        #                  axis=0)
+        #   return outs
+        raise NotImplementedError
+
+    def forward(self, img, mode='train', **kwargs):
+        if mode == 'train':
+            return self.forward_train(img, **kwargs)
+        elif mode == 'test':
+            return self.forward_test(img, **kwargs)
+        elif mode == 'extract':
+            return self.forward_backbone(img)
+        else:
+            raise Exception("No such mode: {}".format(mode))
diff --git a/openselfsup/models/deepcluster.py b/openselfsup/models/deepcluster.py
new file mode 100644
index 00000000..b286d784
--- /dev/null
+++ b/openselfsup/models/deepcluster.py
@@ -0,0 +1,88 @@
+import numpy as np
+
+import torch
+import torch.nn as nn
+
+from openselfsup.utils import print_log
+
+from .
import builder +from .registry import MODELS +from .utils import Sobel + + +@MODELS.register_module +class DeepCluster(nn.Module): + + def __init__(self, + backbone, + with_sobel=False, + neck=None, + head=None, + pretrained=None): + super(DeepCluster, self).__init__() + self.with_sobel = with_sobel + if with_sobel: + self.sobel_layer = Sobel() + self.backbone = builder.build_backbone(backbone) + self.neck = builder.build_neck(neck) + if head is not None: + self.head = builder.build_head(head) + self.init_weights(pretrained=pretrained) + + # reweight + self.num_classes = head.num_classes + self.loss_weight = torch.ones((self.num_classes, ), + dtype=torch.float32).cuda() + self.loss_weight /= self.loss_weight.sum() + + def init_weights(self, pretrained=None): + if pretrained is not None: + print_log('load model from: {}'.format(pretrained), logger='root') + self.backbone.init_weights(pretrained=pretrained) + self.neck.init_weights(init_linear='kaiming') + self.head.init_weights(init_linear='normal') + + def forward_backbone(self, img): + """Forward backbone + + Returns: + x (tuple): backbone outputs + """ + if self.with_sobel: + img = self.sobel_layer(img) + x = self.backbone(img) + return x + + def forward_train(self, img, pseudo_label, **kwargs): + x = self.forward_backbone(img) + assert len(x) == 1 + feature = self.neck(x) + outs = self.head(feature) + loss_inputs = (outs, pseudo_label) + losses = self.head.loss(*loss_inputs) + return losses + + def forward_test(self, img, **kwargs): + x = self.forward_backbone(img) # tuple + outs = self.head(x) + keys = ['head{}'.format(i) for i in range(len(outs))] + out_tensors = [out.cpu() for out in outs] # NxC + return dict(zip(keys, out_tensors)) + + def forward(self, img, mode='train', **kwargs): + if mode == 'train': + return self.forward_train(img, **kwargs) + elif mode == 'test': + return self.forward_test(img, **kwargs) + elif mode == 'extract': + return self.forward_backbone(img) + else: + raise Exception("No such mode: {}".format(mode)) + + def set_reweight(self, labels, reweight_pow=0.5): + hist = np.bincount( + labels, minlength=self.num_classes).astype(np.float32) + inv_hist = (1. / (hist + 1e-10))**reweight_pow + weight = inv_hist / inv_hist.sum() + self.loss_weight.copy_(torch.from_numpy(weight)) + self.head.criterion = nn.CrossEntropyLoss(weight=self.loss_weight) diff --git a/openselfsup/models/heads/__init__.py b/openselfsup/models/heads/__init__.py new file mode 100644 index 00000000..c6bd865c --- /dev/null +++ b/openselfsup/models/heads/__init__.py @@ -0,0 +1,3 @@ +from .contrastive_head import ContrastiveHead +from .cls_head import ClsHead +from .multi_cls_head import MultiClsHead diff --git a/openselfsup/models/heads/cls_head.py b/openselfsup/models/heads/cls_head.py new file mode 100644 index 00000000..b225718d --- /dev/null +++ b/openselfsup/models/heads/cls_head.py @@ -0,0 +1,60 @@ +import torch.nn as nn +from mmcv.cnn import kaiming_init, normal_init + +from ..utils import accuracy +from ..registry import HEADS + + +@HEADS.register_module +class ClsHead(nn.Module): + """Simplest classifier head, with only one fc layer. 
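+
+    Example (an editorial sketch, not in the original file; shapes follow
+    the forward() below):
+        >>> import torch
+        >>> head = ClsHead(with_avg_pool=True, in_channels=2048,
+        ...                num_classes=1000)
+        >>> cls_score = head([torch.rand(2, 2048, 7, 7)])
+        >>> tuple(cls_score[0].shape)
+        (2, 1000)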
+ """ + + def __init__(self, + with_avg_pool=False, + in_channels=2048, + num_classes=1000): + super(ClsHead, self).__init__() + self.with_avg_pool = with_avg_pool + self.in_channels = in_channels + self.num_classes = num_classes + + self.criterion = nn.CrossEntropyLoss() + + if self.with_avg_pool: + self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc_cls = nn.Linear(in_channels, num_classes) + + def init_weights(self, init_linear='normal'): + assert init_linear in ['normal', 'kaiming'], \ + "Undefined init_linear: {}".format(init_linear) + for m in self.modules(): + if isinstance(m, nn.Linear): + if init_linear == 'normal': + normal_init(m, std=0.01) + else: + kaiming_init(m, mode='fan_in', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, nn.GroupNorm, nn.SyncBatchNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + assert isinstance(x, (tuple, list)) and len(x) == 1 + x = x[0] + if self.with_avg_pool: + assert x.dim() == 4, \ + "Tensor must has 4 dims, got: {}".format(x.dim()) + x = self.avg_pool(x) + x = x.view(x.size(0), -1) + cls_score = self.fc_cls(x) + return [cls_score] + + def loss(self, cls_score, labels): + losses = dict() + assert isinstance(cls_score, (tuple, list)) and len(cls_score) == 1 + losses['loss'] = self.criterion(cls_score[0], labels) + losses['acc'] = accuracy(cls_score[0], labels) + return losses diff --git a/openselfsup/models/heads/contrastive_head.py b/openselfsup/models/heads/contrastive_head.py new file mode 100644 index 00000000..b0455293 --- /dev/null +++ b/openselfsup/models/heads/contrastive_head.py @@ -0,0 +1,29 @@ +import torch +import torch.nn as nn + +from ..registry import HEADS + + +@HEADS.register_module +class ContrastiveHead(nn.Module): + '''Head for contrastive learning. + ''' + + def __init__(self, temperature=0.1): + super(ContrastiveHead, self).__init__() + self.criterion = nn.CrossEntropyLoss() + self.temperature = temperature + + def forward(self, pos, neg): + ''' + Args: + pos (Tensor): Nx1 positive similarity + neg (Tensor): Nxk negative similarity + ''' + N = pos.size(0) + logits = torch.cat((pos, neg), dim=1) + logits /= self.temperature + labels = torch.zeros((N, ), dtype=torch.long).cuda() + losses = dict() + losses['loss'] = self.criterion(logits, labels) + return losses diff --git a/openselfsup/models/heads/multi_cls_head.py b/openselfsup/models/heads/multi_cls_head.py new file mode 100644 index 00000000..babe5649 --- /dev/null +++ b/openselfsup/models/heads/multi_cls_head.py @@ -0,0 +1,77 @@ +import torch.nn as nn + +from ..utils import accuracy +from ..registry import HEADS +from ..utils import build_norm_layer, MultiPooling + + +@HEADS.register_module +class MultiClsHead(nn.Module): + """Multiple classifier heads. 
+ """ + FEAT_CHANNELS = {'resnet50': [64, 256, 512, 1024, 2048]} + FEAT_LAST_UNPOOL = {'resnet50': 2048 * 7 * 7} + + def __init__(self, + pool_type='adaptive', + in_indices=(0, ), + with_last_layer_unpool=False, + backbone='resnet50', + norm_cfg=dict(type='BN'), + num_classes=1000): + super(MultiClsHead, self).__init__() + assert norm_cfg['type'] in ['BN', 'SyncBN', 'GN', 'null'] + + self.with_last_layer_unpool = with_last_layer_unpool + self.with_norm = norm_cfg['type'] != 'null' + + self.criterion = nn.CrossEntropyLoss() + + self.multi_pooling = MultiPooling(pool_type, in_indices, backbone) + + if self.with_norm: + self.norms = nn.ModuleList([ + build_norm_layer(norm_cfg, self.FEAT_CHANNELS[backbone][l])[1] + for l in in_indices + ]) + + self.fcs = nn.ModuleList([ + nn.Linear(self.multi_pooling.POOL_DIMS[backbone][l], num_classes) + for l in in_indices + ]) + if with_last_layer_unpool: + self.fcs.append( + nn.Linear(self.FEAT_LAST_UNPOOL[backbone], num_classes)) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.constant_(m.bias, 0) + elif isinstance(m, + (nn.BatchNorm2d, nn.GroupNorm, nn.SyncBatchNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + assert isinstance(x, (list, tuple)) + if self.with_last_layer_unpool: + last_x = x[-1] + x = self.multi_pooling(x) + if self.with_norm: + x = [n(xx) for n, xx in zip(self.norms, x)] + if self.with_last_layer_unpool: + x.append(last_x) + x = [xx.view(xx.size(0), -1) for xx in x] + x = [fc(xx) for fc, xx in zip(self.fcs, x)] + return x + + def loss(self, cls_score, labels): + losses = dict() + for i, s in enumerate(cls_score): + # keys must contain "loss" + losses['loss.{}'.format(i + 1)] = self.criterion(s, labels) + losses['acc.{}'.format(i + 1)] = accuracy(s, labels) + return losses diff --git a/openselfsup/models/losses/__init__.py b/openselfsup/models/losses/__init__.py new file mode 100644 index 00000000..bfa523d6 --- /dev/null +++ b/openselfsup/models/losses/__init__.py @@ -0,0 +1,19 @@ +#from .balanced_l1_loss import BalancedL1Loss, balanced_l1_loss +#from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, +# cross_entropy, mask_cross_entropy) +#from .focal_loss import FocalLoss, sigmoid_focal_loss +#from .ghm_loss import GHMC, GHMR +#from .iou_loss import (BoundedIoULoss, GIoULoss, IoULoss, bounded_iou_loss, +# iou_loss) +#from .mse_loss import MSELoss, mse_loss +#from .smooth_l1_loss import SmoothL1Loss, smooth_l1_loss +#from .utils import reduce_loss, weight_reduce_loss, weighted_loss + +#__all__ = [ +# 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', +# 'mask_cross_entropy', 'CrossEntropyLoss', 'sigmoid_focal_loss', +# 'FocalLoss', 'smooth_l1_loss', 'SmoothL1Loss', 'balanced_l1_loss', +# 'BalancedL1Loss', 'mse_loss', 'MSELoss', 'iou_loss', 'bounded_iou_loss', +# 'IoULoss', 'BoundedIoULoss', 'GIoULoss', 'GHMC', 'GHMR', 'reduce_loss', +# 'weight_reduce_loss', 'weighted_loss' +#] diff --git a/openselfsup/models/losses/cross_entropy_loss.py b/openselfsup/models/losses/cross_entropy_loss.py new file mode 100644 index 00000000..dd9d4776 --- /dev/null +++ b/openselfsup/models/losses/cross_entropy_loss.py @@ -0,0 +1,103 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..registry import LOSSES +from .utils import weight_reduce_loss + + +def cross_entropy(pred, label, weight=None, reduction='mean', 
avg_factor=None): + # element-wise losses + loss = F.cross_entropy(pred, label, reduction='none') + + # apply weights and do the reduction + if weight is not None: + weight = weight.float() + loss = weight_reduce_loss( + loss, weight=weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def _expand_binary_labels(labels, label_weights, label_channels): + bin_labels = labels.new_full((labels.size(0), label_channels), 0) + inds = torch.nonzero(labels >= 1).squeeze() + if inds.numel() > 0: + bin_labels[inds, labels[inds] - 1] = 1 + if label_weights is None: + bin_label_weights = None + else: + bin_label_weights = label_weights.view(-1, 1).expand( + label_weights.size(0), label_channels) + return bin_labels, bin_label_weights + + +def binary_cross_entropy(pred, + label, + weight=None, + reduction='mean', + avg_factor=None): + if pred.dim() != label.dim(): + label, weight = _expand_binary_labels(label, weight, pred.size(-1)) + + # weighted element-wise losses + if weight is not None: + weight = weight.float() + loss = F.binary_cross_entropy_with_logits( + pred, label.float(), weight, reduction='none') + # do the reduction for the weighted loss + loss = weight_reduce_loss(loss, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def mask_cross_entropy(pred, target, label, reduction='mean', avg_factor=None): + # TODO: handle these two reserved arguments + assert reduction == 'mean' and avg_factor is None + num_rois = pred.size()[0] + inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) + pred_slice = pred[inds, label].squeeze(1) + return F.binary_cross_entropy_with_logits( + pred_slice, target, reduction='mean')[None] + + +@LOSSES.register_module +class CrossEntropyLoss(nn.Module): + + def __init__(self, + use_sigmoid=False, + use_mask=False, + reduction='mean', + loss_weight=1.0): + super(CrossEntropyLoss, self).__init__() + assert (use_sigmoid is False) or (use_mask is False) + self.use_sigmoid = use_sigmoid + self.use_mask = use_mask + self.reduction = reduction + self.loss_weight = loss_weight + + if self.use_sigmoid: + self.cls_criterion = binary_cross_entropy + elif self.use_mask: + self.cls_criterion = mask_cross_entropy + else: + self.cls_criterion = cross_entropy + + def forward(self, + cls_score, + label, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_cls = self.loss_weight * self.cls_criterion( + cls_score, + label, + weight, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_cls diff --git a/openselfsup/models/losses/focal_loss.py b/openselfsup/models/losses/focal_loss.py new file mode 100644 index 00000000..2cbf2edd --- /dev/null +++ b/openselfsup/models/losses/focal_loss.py @@ -0,0 +1,82 @@ +import torch.nn as nn +import torch.nn.functional as F + +from openselfsup.ops import sigmoid_focal_loss as _sigmoid_focal_loss +from ..registry import LOSSES +from .utils import weight_reduce_loss + + +# This method is only for debugging +def py_sigmoid_focal_loss(pred, + target, + weight=None, + gamma=2.0, + alpha=0.25, + reduction='mean', + avg_factor=None): + pred_sigmoid = pred.sigmoid() + target = target.type_as(pred) + pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) + focal_weight = (alpha * target + (1 - alpha) * + (1 - target)) * pt.pow(gamma) + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') * 
focal_weight + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +def sigmoid_focal_loss(pred, + target, + weight=None, + gamma=2.0, + alpha=0.25, + reduction='mean', + avg_factor=None): + # Function.apply does not accept keyword arguments, so the decorator + # "weighted_loss" is not applicable + loss = _sigmoid_focal_loss(pred, target, gamma, alpha) + # TODO: find a proper way to handle the shape of weight + if weight is not None: + weight = weight.view(-1, 1) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@LOSSES.register_module +class FocalLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0): + super(FocalLoss, self).__init__() + assert use_sigmoid is True, 'Only sigmoid focal loss supported now.' + self.use_sigmoid = use_sigmoid + self.gamma = gamma + self.alpha = alpha + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + loss_cls = self.loss_weight * sigmoid_focal_loss( + pred, + target, + weight, + gamma=self.gamma, + alpha=self.alpha, + reduction=reduction, + avg_factor=avg_factor) + else: + raise NotImplementedError + return loss_cls diff --git a/openselfsup/models/losses/ghm_loss.py b/openselfsup/models/losses/ghm_loss.py new file mode 100644 index 00000000..e62b9904 --- /dev/null +++ b/openselfsup/models/losses/ghm_loss.py @@ -0,0 +1,171 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..registry import LOSSES + + +def _expand_binary_labels(labels, label_weights, label_channels): + bin_labels = labels.new_full((labels.size(0), label_channels), 0) + inds = torch.nonzero(labels >= 1).squeeze() + if inds.numel() > 0: + bin_labels[inds, labels[inds] - 1] = 1 + bin_label_weights = label_weights.view(-1, 1).expand( + label_weights.size(0), label_channels) + return bin_labels, bin_label_weights + + +# TODO: code refactoring to make it consistent with other losses +@LOSSES.register_module +class GHMC(nn.Module): + """GHM Classification Loss. + + Details of the theorem can be viewed in the paper + "Gradient Harmonized Single-stage Detector". + https://arxiv.org/abs/1811.05181 + + Args: + bins (int): Number of the unit regions for distribution calculation. + momentum (float): The parameter for moving average. + use_sigmoid (bool): Can only be true for BCE based loss now. + loss_weight (float): The weight of the total GHM-C loss. + """ + + def __init__(self, bins=10, momentum=0, use_sigmoid=True, loss_weight=1.0): + super(GHMC, self).__init__() + self.bins = bins + self.momentum = momentum + edges = torch.arange(bins + 1).float() / bins + self.register_buffer('edges', edges) + self.edges[-1] += 1e-6 + if momentum > 0: + acc_sum = torch.zeros(bins) + self.register_buffer('acc_sum', acc_sum) + self.use_sigmoid = use_sigmoid + if not self.use_sigmoid: + raise NotImplementedError + self.loss_weight = loss_weight + + def forward(self, pred, target, label_weight, *args, **kwargs): + """Calculate the GHM-C loss. + + Args: + pred (float tensor of size [batch_num, class_num]): + The direct prediction of classification fc layer. + target (float tensor of size [batch_num, class_num]): + Binary class target for each sample. 
+ label_weight (float tensor of size [batch_num, class_num]): + the value is 1 if the sample is valid and 0 if ignored. + Returns: + The gradient harmonized loss. + """ + # the target should be binary class label + if pred.dim() != target.dim(): + target, label_weight = _expand_binary_labels( + target, label_weight, pred.size(-1)) + target, label_weight = target.float(), label_weight.float() + edges = self.edges + mmt = self.momentum + weights = torch.zeros_like(pred) + + # gradient length + g = torch.abs(pred.sigmoid().detach() - target) + + valid = label_weight > 0 + tot = max(valid.float().sum().item(), 1.0) + n = 0 # n valid bins + for i in range(self.bins): + inds = (g >= edges[i]) & (g < edges[i + 1]) & valid + num_in_bin = inds.sum().item() + if num_in_bin > 0: + if mmt > 0: + self.acc_sum[i] = mmt * self.acc_sum[i] \ + + (1 - mmt) * num_in_bin + weights[inds] = tot / self.acc_sum[i] + else: + weights[inds] = tot / num_in_bin + n += 1 + if n > 0: + weights = weights / n + + loss = F.binary_cross_entropy_with_logits( + pred, target, weights, reduction='sum') / tot + return loss * self.loss_weight + + +# TODO: code refactoring to make it consistent with other losses +@LOSSES.register_module +class GHMR(nn.Module): + """GHM Regression Loss. + + Details of the theorem can be viewed in the paper + "Gradient Harmonized Single-stage Detector" + https://arxiv.org/abs/1811.05181 + + Args: + mu (float): The parameter for the Authentic Smooth L1 loss. + bins (int): Number of the unit regions for distribution calculation. + momentum (float): The parameter for moving average. + loss_weight (float): The weight of the total GHM-R loss. + """ + + def __init__(self, mu=0.02, bins=10, momentum=0, loss_weight=1.0): + super(GHMR, self).__init__() + self.mu = mu + self.bins = bins + edges = torch.arange(bins + 1).float() / bins + self.register_buffer('edges', edges) + self.edges[-1] = 1e3 + self.momentum = momentum + if momentum > 0: + acc_sum = torch.zeros(bins) + self.register_buffer('acc_sum', acc_sum) + self.loss_weight = loss_weight + + # TODO: support reduction parameter + def forward(self, pred, target, label_weight, avg_factor=None): + """Calculate the GHM-R loss. + + Args: + pred (float tensor of size [batch_num, 4 (* class_num)]): + The prediction of box regression layer. Channel number can be 4 + or 4 * class_num depending on whether it is class-agnostic. + target (float tensor of size [batch_num, 4 (* class_num)]): + The target regression values with the same size of pred. + label_weight (float tensor of size [batch_num, 4 (* class_num)]): + The weight of each sample, 0 if ignored. + Returns: + The gradient harmonized loss. 
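+
+        Editorial note (derived from the code below): the Authentic Smooth
+        L1 loss is ASL1(d) = sqrt(d^2 + mu^2) - mu, so the "gradient
+        length" binned below is |d| / sqrt(d^2 + mu^2), which lies in
+        [0, 1).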
+ """ + mu = self.mu + edges = self.edges + mmt = self.momentum + + # ASL1 loss + diff = pred - target + loss = torch.sqrt(diff * diff + mu * mu) - mu + + # gradient length + g = torch.abs(diff / torch.sqrt(mu * mu + diff * diff)).detach() + weights = torch.zeros_like(g) + + valid = label_weight > 0 + tot = max(label_weight.float().sum().item(), 1.0) + n = 0 # n: valid bins + for i in range(self.bins): + inds = (g >= edges[i]) & (g < edges[i + 1]) & valid + num_in_bin = inds.sum().item() + if num_in_bin > 0: + n += 1 + if mmt > 0: + self.acc_sum[i] = mmt * self.acc_sum[i] \ + + (1 - mmt) * num_in_bin + weights[inds] = tot / self.acc_sum[i] + else: + weights[inds] = tot / num_in_bin + if n > 0: + weights /= n + + loss = loss * weights + loss = loss.sum() / tot + return loss * self.loss_weight diff --git a/openselfsup/models/losses/utils.py b/openselfsup/models/losses/utils.py new file mode 100644 index 00000000..3361c6ca --- /dev/null +++ b/openselfsup/models/losses/utils.py @@ -0,0 +1,98 @@ +import functools + +import torch.nn.functional as F + + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + + +def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Avarage factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + loss = loss.sum() / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + + +def weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) 
+ >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper diff --git a/openselfsup/models/memories/__init__.py b/openselfsup/models/memories/__init__.py new file mode 100644 index 00000000..21a0c929 --- /dev/null +++ b/openselfsup/models/memories/__init__.py @@ -0,0 +1,3 @@ +from .odc_memory import ODCMemory +from .odc_memory_gpu import ODCMemoryGPU +from .simple_memory import SimpleMemory diff --git a/openselfsup/models/memories/odc_memory.py b/openselfsup/models/memories/odc_memory.py new file mode 100644 index 00000000..d5e6b624 --- /dev/null +++ b/openselfsup/models/memories/odc_memory.py @@ -0,0 +1,217 @@ +import numpy as np +from sklearn.cluster import KMeans + +import torch +import torch.nn as nn +import torch.distributed as dist +from mmcv.runner import get_dist_info + +from ..registry import MEMORIES + + +@MEMORIES.register_module +class ODCMemory(nn.Module): + + def __init__(self, length, feat_dim, momentum, num_classes, min_cluster, + **kwargs): + super(ODCMemory, self).__init__() + self.rank, self.num_replicas = get_dist_info() + if self.rank == 0: + self.feature_bank = torch.zeros((length, feat_dim), + dtype=torch.float32) + self.label_bank = torch.zeros((length, ), dtype=torch.long) + self.centroids = torch.zeros((num_classes, feat_dim), + dtype=torch.float32).cuda() + self.kmeans = KMeans(n_clusters=2, random_state=0, max_iter=20) + self.feat_dim = feat_dim + self.initialized = False + self.momentum = momentum + self.num_classes = num_classes + self.min_cluster = min_cluster + self.debug = kwargs.get('debug', False) + + def init_memory(self, feature, label): + self.initialized = True + self.label_bank.copy_(torch.from_numpy(label).long()) + # make sure no empty clusters + assert (np.bincount(label, minlength=self.num_classes) != 0).all() + if self.rank == 0: + feature /= (np.linalg.norm(feature, axis=1).reshape(-1, 1) + 1e-10) + self.feature_bank.copy_(torch.from_numpy(feature)) + centroids = self._compute_centroids() + self.centroids.copy_(centroids) + dist.broadcast(self.centroids, 0) + + def _compute_centroids_ind(self, cinds): + '''compute a few centroids''' + assert self.rank == 0 + num = len(cinds) + centroids = torch.zeros((num, self.feat_dim), dtype=torch.float32) + for i, c in enumerate(cinds): + ind = np.where(self.label_bank.numpy() == c)[0] + centroids[i, :] = self.feature_bank[ind, :].mean(dim=0) + return centroids + + def _compute_centroids(self): + '''compute all non-empty centroids''' + assert self.rank == 0 + l = self.label_bank.numpy() + argl = np.argsort(l) + sortl = l[argl] + diff_pos = np.where(sortl[1:] - sortl[:-1] != 0)[0] + 1 + start = np.insert(diff_pos, 0, 0) + end = np.insert(diff_pos, len(diff_pos), len(l)) + class_start = sortl[start] + # keep empty class centroids unchanged + centroids = self.centroids.cpu().clone() + for i, st, ed in zip(class_start, start, end): + centroids[i, :] = self.feature_bank[argl[st:ed], :].mean(dim=0) + return centroids + + def _gather(self, ind, feature): # gather ind and feature + #if not hasattr(self, 'ind_gathered'): + # self.ind_gathered = [torch.ones_like(ind).cuda() + # for _ in range(self.num_replicas)] + #if not hasattr(self, 
'feature_gathered'): + # self.feature_gathered = [torch.ones_like(feature).cuda() + # for _ in range(self.num_replicas)] + ind_gathered = [ + torch.ones_like(ind).cuda() for _ in range(self.num_replicas) + ] + feature_gathered = [ + torch.ones_like(feature).cuda() for _ in range(self.num_replicas) + ] + dist.all_gather(ind_gathered, ind) + dist.all_gather(feature_gathered, feature) + ind_gathered = torch.cat(ind_gathered, dim=0) + feature_gathered = torch.cat(feature_gathered, dim=0) + return ind_gathered, feature_gathered + + def update_samples_memory(self, ind, feature): # ind, feature: cuda tensor + assert self.initialized + feature_norm = feature / (feature.norm(dim=1).view(-1, 1) + 1e-10 + ) # normalize + ind, feature_norm = self._gather( + ind, feature_norm) # ind: (N*w), feature: (N*w)xk, cuda tensor + ind = ind.cpu() + if self.rank == 0: + feature_old = self.feature_bank[ind, ...].cuda() + feature_new = (1 - self.momentum) * feature_old + \ + self.momentum * feature_norm + feature_norm = feature_new / ( + feature_new.norm(dim=1).view(-1, 1) + 1e-10) + self.feature_bank[ind, ...] = feature_norm.cpu() + dist.barrier() + dist.broadcast(feature_norm, 0) + # compute new labels + similarity_to_centroids = torch.mm(self.centroids, + feature_norm.permute(1, 0)) # CxN + newlabel = similarity_to_centroids.argmax(dim=0) # cuda tensor + newlabel_cpu = newlabel.cpu() + change_ratio = (newlabel_cpu != + self.label_bank[ind]).sum().float().cuda() \ + / float(newlabel_cpu.shape[0]) + self.label_bank[ind] = newlabel_cpu.clone() # copy to cpu + return change_ratio + + def deal_with_small_clusters(self): + # check empty class + hist = np.bincount(self.label_bank.numpy(), minlength=self.num_classes) + small_clusters = np.where(hist < self.min_cluster)[0].tolist() + if self.debug and self.rank == 0: + print("mincluster: {}, num of small class: {}".format( + hist.min(), len(small_clusters))) + if len(small_clusters) == 0: + return + # re-assign samples in small clusters to make them empty + for s in small_clusters: + ind = np.where(self.label_bank.numpy() == s)[0] + if len(ind) > 0: + inclusion = torch.from_numpy( + np.setdiff1d( + np.arange(self.num_classes), + np.array(small_clusters), + assume_unique=True)).cuda() + if self.rank == 0: + target_ind = torch.mm( + self.centroids[inclusion, :], + self.feature_bank[ind, :].cuda().permute( + 1, 0)).argmax(dim=0) + target = inclusion[target_ind] + else: + target = torch.zeros((ind.shape[0], ), + dtype=torch.int64).cuda() + dist.all_reduce(target) + self.label_bank[ind] = torch.from_numpy(target.cpu().numpy()) + # deal with empty cluster + self._redirect_empty_clusters(small_clusters) + + def update_centroids_memory(self, cinds=None): + if self.rank == 0: + if self.debug: + print("updating centroids ...") + if cinds is None: + center = self._compute_centroids() + self.centroids.copy_(center) + else: + center = self._compute_centroids_ind(cinds) + self.centroids[ + torch.LongTensor(cinds).cuda(), :] = center.cuda() + dist.broadcast(self.centroids, 0) + + def _partition_max_cluster(self, max_cluster): + assert self.rank == 0 + max_cluster_inds = np.where(self.label_bank == max_cluster)[0] + + assert len(max_cluster_inds) >= 2 + max_cluster_features = self.feature_bank[max_cluster_inds, :] + if np.any(np.isnan(max_cluster_features.numpy())): + raise Exception("Has nan in features.") + kmeans_ret = self.kmeans.fit(max_cluster_features) + sub_cluster1_ind = max_cluster_inds[kmeans_ret.labels_ == 0] + sub_cluster2_ind = max_cluster_inds[kmeans_ret.labels_ == 1] + 
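+        # Editorial note: sklearn's 2-way k-means can in principle assign
+        # every sample to a single sub-cluster; the fallback below then
+        # splits the max cluster randomly instead.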
if not (len(sub_cluster1_ind) > 0 and len(sub_cluster2_ind) > 0): + print( + "Warning: kmeans partition fails, resort to random partition.") + sub_cluster1_ind = np.random.choice( + max_cluster_inds, len(max_cluster_inds) // 2, replace=False) + sub_cluster2_ind = np.setdiff1d( + max_cluster_inds, sub_cluster1_ind, assume_unique=True) + return sub_cluster1_ind, sub_cluster2_ind + + def _redirect_empty_clusters(self, empty_clusters): + for e in empty_clusters: + assert (self.label_bank != e).all().item(), \ + "Cluster #{} is not an empty cluster.".format(e) + max_cluster = np.bincount( + self.label_bank, minlength=self.num_classes).argmax().item() + # gather partitioning indices + if self.rank == 0: + sub_cluster1_ind, sub_cluster2_ind = self._partition_max_cluster( + max_cluster) + size1 = torch.LongTensor([len(sub_cluster1_ind)]).cuda() + size2 = torch.LongTensor([len(sub_cluster2_ind)]).cuda() + sub_cluster1_ind_tensor = torch.from_numpy( + sub_cluster1_ind).long().cuda() + sub_cluster2_ind_tensor = torch.from_numpy( + sub_cluster2_ind).long().cuda() + else: + size1 = torch.LongTensor([0]).cuda() + size2 = torch.LongTensor([0]).cuda() + dist.all_reduce(size1) + dist.all_reduce(size2) + if self.rank != 0: + sub_cluster1_ind_tensor = torch.zeros( + (size1, ), dtype=torch.int64).cuda() + sub_cluster2_ind_tensor = torch.zeros( + (size2, ), dtype=torch.int64).cuda() + dist.broadcast(sub_cluster1_ind_tensor, 0) + dist.broadcast(sub_cluster2_ind_tensor, 0) + if self.rank != 0: + sub_cluster1_ind = sub_cluster1_ind_tensor.cpu().numpy() + sub_cluster2_ind = sub_cluster2_ind_tensor.cpu().numpy() + + # reassign samples in partition #2 to the empty class + self.label_bank[sub_cluster2_ind] = e + # update centroids of max_cluster and e + self.update_centroids_memory([max_cluster, e]) diff --git a/openselfsup/models/memories/odc_memory_gpu.py b/openselfsup/models/memories/odc_memory_gpu.py new file mode 100644 index 00000000..be078566 --- /dev/null +++ b/openselfsup/models/memories/odc_memory_gpu.py @@ -0,0 +1,190 @@ +import numpy as np +from sklearn.cluster import KMeans + +import torch +import torch.nn as nn +import torch.distributed as dist +from mmcv.runner import get_dist_info + +from ..registry import MEMORIES + + +@MEMORIES.register_module +class ODCMemoryGPU(nn.Module): + '''Memory bank for Online Deep Clustering. Feature bank stored in GPU. 
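+
+    Editorial note: unlike ODCMemory above, which keeps the feature and
+    label banks on the rank-0 CPU and broadcasts updates, this variant
+    keeps both banks on every GPU, trading memory for fewer host-device
+    copies.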
+ ''' + + def __init__(self, length, feat_dim, momentum, num_classes, min_cluster, + **kwargs): + super(ODCMemoryGPU, self).__init__() + self.rank, self.num_replicas = get_dist_info() + self.feature_bank = torch.zeros((length, feat_dim), + dtype=torch.float32).cuda() + self.label_bank = torch.zeros((length, ), dtype=torch.long).cuda() + self.centroids = torch.zeros((num_classes, feat_dim), + dtype=torch.float32).cuda() + self.kmeans = KMeans(n_clusters=2, random_state=0, max_iter=20) + self.feat_dim = feat_dim + self.initialized = False + self.momentum = momentum + self.num_classes = num_classes + self.min_cluster = min_cluster + self.debug = kwargs.get('debug', False) + + @torch.no_grad() + def init_memory(self, feature, label): + self.initialized = True + self.label_bank.copy_(torch.from_numpy(label).long().cuda()) + # make sure no empty clusters + assert (np.bincount(label, minlength=self.num_classes) != 0).all() + feature /= (np.linalg.norm(feature, axis=1).reshape(-1, 1) + 1e-10) + self.feature_bank.copy_(torch.from_numpy(feature)) + self._compute_centroids() + + @torch.no_grad() + def _compute_centroids_ind(self, cinds): + '''compute a few centroids''' + for i, c in enumerate(cinds): + ind = torch.where(self.label_bank == c)[0] + self.centroids[i, :] = self.feature_bank[ind, :].mean(dim=0) + + def _compute_centroids(self): + if self.debug: + print("enter: _compute_centroids") + '''compute all non-empty centroids''' + l = self.label_bank.cpu().numpy() + argl = np.argsort(l) + sortl = l[argl] + diff_pos = np.where(sortl[1:] - sortl[:-1] != 0)[0] + 1 + start = np.insert(diff_pos, 0, 0) + end = np.insert(diff_pos, len(diff_pos), len(l)) + class_start = sortl[start] + # keep empty class centroids unchanged + for i, st, ed in zip(class_start, start, end): + self.centroids[i, :] = self.feature_bank[argl[st:ed], :].mean( + dim=0) + + def _gather(self, ind, feature): # gather ind and feature + if self.debug: + print("enter: _gather") + assert ind.size(0) > 0 + ind_gathered = [ + torch.ones_like(ind).cuda() for _ in range(self.num_replicas) + ] + feature_gathered = [ + torch.ones_like(feature).cuda() for _ in range(self.num_replicas) + ] + dist.all_gather(ind_gathered, ind) + dist.all_gather(feature_gathered, feature) + ind_gathered = torch.cat(ind_gathered, dim=0) + feature_gathered = torch.cat(feature_gathered, dim=0) + return ind_gathered, feature_gathered + + def update_samples_memory(self, ind, feature): # ind, feature: cuda tensor + if self.debug: + print("enter: update_samples_memory") + assert self.initialized + feature_norm = feature / (feature.norm(dim=1).view(-1, 1) + 1e-10 + ) # normalize + ind, feature_norm = self._gather( + ind, feature_norm) # ind: (N*w), feature: (N*w)xk, cuda tensor + # momentum update + feature_old = self.feature_bank[ind, ...] + feature_new = (1 - self.momentum) * feature_old + \ + self.momentum * feature_norm + feature_norm = feature_new / ( + feature_new.norm(dim=1).view(-1, 1) + 1e-10) + self.feature_bank[ind, ...] 
= feature_norm + # compute new labels + similarity_to_centroids = torch.mm(self.centroids, + feature_norm.permute(1, 0)) # CxN + newlabel = similarity_to_centroids.argmax(dim=0) # cuda tensor + change_ratio = (newlabel != + self.label_bank[ind]).sum().float() \ + / float(newlabel.shape[0]) + self.label_bank[ind] = newlabel.clone() # copy to cpu + return change_ratio + + @torch.no_grad() + def deal_with_small_clusters(self): + if self.debug: + print("enter: deal_with_small_clusters") + # check empty class + hist = torch.bincount(self.label_bank, minlength=self.num_classes) + small_clusters = torch.where(hist < self.min_cluster)[0] + if self.debug and self.rank == 0: + print("mincluster: {}, num of small class: {}".format( + hist.min(), len(small_clusters))) + if len(small_clusters) == 0: + return + # re-assign samples in small clusters to make them empty + for s in small_clusters: + ind = torch.where(self.label_bank == s)[0] + if len(ind) > 0: + inclusion = torch.from_numpy( + np.setdiff1d( + np.arange(self.num_classes), + small_clusters.cpu().numpy(), + assume_unique=True)).cuda() + target_ind = torch.mm(self.centroids[inclusion, :], + self.feature_bank[ind, :].permute( + 1, 0)).argmax(dim=0) + target = inclusion[target_ind] + self.label_bank[ind] = target + # deal with empty cluster + self._redirect_empty_clusters(small_clusters) + + def update_centroids_memory(self, cinds=None): + if cinds is None: + self._compute_centroids() + else: + self._compute_centroids_ind(cinds) + + def _partition_max_cluster(self, max_cluster): + if self.debug: + print("enter: _partition_max_cluster") + assert self.rank == 0 # avoid randomness among ranks + max_cluster_inds = torch.where(self.label_bank == max_cluster)[0] + size = len(max_cluster_inds) + + assert size >= 2 # image indices in the max cluster + max_cluster_features = self.feature_bank[max_cluster_inds, :] + if torch.any(torch.isnan(max_cluster_features)): + raise Exception("Has nan in features.") + kmeans_ret = self.kmeans.fit(max_cluster_features.cpu().numpy()) + kmeans_labels = torch.from_numpy(kmeans_ret.labels_).cuda() + sub_cluster1_ind = max_cluster_inds[kmeans_labels == 0] + sub_cluster2_ind = max_cluster_inds[kmeans_labels == 1] + if not (len(sub_cluster1_ind) > 0 and len(sub_cluster2_ind) > 0): + print( + "Warning: kmeans partition fails, resort to random partition.") + rnd_idx = torch.randperm(size) + sub_cluster1_ind = max_cluster_inds[rnd_idx[:size // 2]] + sub_cluster2_ind = max_cluster_inds[rnd_idx[size // 2:]] + return sub_cluster1_ind, sub_cluster2_ind + + def _redirect_empty_clusters(self, empty_clusters): + if self.debug: + print("enter: _redirect_empty_clusters") + for e in empty_clusters: + assert (self.label_bank != e).all().item(), \ + "Cluster #{} is not an empty cluster.".format(e) + max_cluster = torch.bincount( + self.label_bank, minlength=self.num_classes).argmax().item() + # gather partitioning indices + if self.rank == 0: + sub_cluster1_ind, sub_cluster2_ind = self._partition_max_cluster( + max_cluster) + size2 = torch.LongTensor([len(sub_cluster2_ind)]).cuda() + else: + size2 = torch.LongTensor([0]).cuda() + dist.all_reduce(size2) + if self.rank != 0: + sub_cluster2_ind = torch.zeros((size2, ), + dtype=torch.int64).cuda() + dist.broadcast(sub_cluster2_ind, 0) + + # reassign samples in partition #2 to the empty class + self.label_bank[sub_cluster2_ind] = e + # update centroids of max_cluster and e + self.update_centroids_memory([max_cluster, e]) diff --git a/openselfsup/models/memories/simple_memory.py 
b/openselfsup/models/memories/simple_memory.py new file mode 100644 index 00000000..5ee7775d --- /dev/null +++ b/openselfsup/models/memories/simple_memory.py @@ -0,0 +1,42 @@ +import torch +import torch.nn as nn +import torch.distributed as dist +from mmcv.runner import get_dist_info +from openselfsup.utils import AliasMethod + +from ..registry import MEMORIES + + +@MEMORIES.register_module +class SimpleMemory(nn.Module): + + def __init__(self, length, feat_dim, momentum, **kwargs): + super(SimpleMemory, self).__init__() + self.rank, self.num_replicas = get_dist_info() + self.feature_bank = torch.randn(length, feat_dim).cuda() + self.feature_bank = nn.functional.normalize(self.feature_bank) + self.momentum = momentum + self.multinomial = AliasMethod(torch.ones(length)) + self.multinomial.cuda() + + def update(self, ind, feature): + feature_norm = nn.functional.normalize(feature) + ind, feature_norm = self._gather(ind, feature_norm) + feature_old = self.feature_bank[ind, ...] + feature_new = (1 - self.momentum) * feature_old + \ + self.momentum * feature_norm + feature_new_norm = nn.functional.normalize(feature_new) + self.feature_bank[ind, ...] = feature_new_norm + + def _gather(self, ind, feature): # gather ind and feature + ind_gathered = [ + torch.ones_like(ind).cuda() for _ in range(self.num_replicas) + ] + feature_gathered = [ + torch.ones_like(feature).cuda() for _ in range(self.num_replicas) + ] + dist.all_gather(ind_gathered, ind) + dist.all_gather(feature_gathered, feature) + ind_gathered = torch.cat(ind_gathered, dim=0) + feature_gathered = torch.cat(feature_gathered, dim=0) + return ind_gathered, feature_gathered diff --git a/openselfsup/models/moco.py b/openselfsup/models/moco.py new file mode 100644 index 00000000..c4ff19a7 --- /dev/null +++ b/openselfsup/models/moco.py @@ -0,0 +1,189 @@ +import torch +import torch.nn as nn + +from openselfsup.utils import print_log + +from . import builder +from .registry import MODELS + + +@MODELS.register_module +class MOCO(nn.Module): + '''MOCO. + Part of the code is borrowed from: + "https://github.com/facebookresearch/moco/blob/master/moco/builder.py". 
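+
+    Arguments (editorial summary of the constructor below):
+        queue_len (int): length of the negative-key queue. Default: 65536.
+        feat_dim (int): dimension of the projected features. Default: 128.
+        momentum (float): momentum coefficient for updating the key
+            encoder. Default: 0.999.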
+ ''' + + def __init__(self, + backbone, + neck=None, + head=None, + pretrained=None, + queue_len=65536, + feat_dim=128, + momentum=0.999, + **kwargs): + super(MOCO, self).__init__() + self.encoder_q = nn.Sequential( + builder.build_backbone(backbone), builder.build_neck(neck)) + self.encoder_k = nn.Sequential( + builder.build_backbone(backbone), builder.build_neck(neck)) + self.backbone = self.encoder_q[0] + for param in self.encoder_k.parameters(): + param.requires_grad = False + self.head = builder.build_head(head) + self.init_weights(pretrained=pretrained) + + self.queue_len = queue_len + self.momentum = momentum + + # create the queue + self.register_buffer("queue", torch.randn(feat_dim, queue_len)) + self.queue = nn.functional.normalize(self.queue, dim=0) + self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long)) + + def init_weights(self, pretrained=None): + if pretrained is not None: + print_log('load model from: {}'.format(pretrained), logger='root') + self.encoder_q[0].init_weights(pretrained=pretrained) + self.encoder_q[1].init_weights(init_linear='kaiming') + for param_q, param_k in zip(self.encoder_q.parameters(), + self.encoder_k.parameters()): + param_k.data.copy_(param_q.data) + + @torch.no_grad() + def _momentum_update_key_encoder(self): + """ + Momentum update of the key encoder + """ + for param_q, param_k in zip(self.encoder_q.parameters(), + self.encoder_k.parameters()): + param_k.data = param_k.data * self.momentum + \ + param_q.data * (1. - self.momentum) + + @torch.no_grad() + def _dequeue_and_enqueue(self, keys): + # gather keys before updating queue + keys = concat_all_gather(keys) + + batch_size = keys.shape[0] + + ptr = int(self.queue_ptr) + assert self.queue_len % batch_size == 0 # for simplicity + + # replace the keys at ptr (dequeue and enqueue) + self.queue[:, ptr:ptr + batch_size] = keys.transpose(0, 1) + ptr = (ptr + batch_size) % self.queue_len # move pointer + + self.queue_ptr[0] = ptr + + @torch.no_grad() + def _batch_shuffle_ddp(self, x): + """ + Batch shuffle, for making use of BatchNorm. + *** Only support DistributedDataParallel (DDP) model. *** + """ + # gather from all gpus + batch_size_this = x.shape[0] + x_gather = concat_all_gather(x) + batch_size_all = x_gather.shape[0] + + num_gpus = batch_size_all // batch_size_this + + # random shuffle index + idx_shuffle = torch.randperm(batch_size_all).cuda() + + # broadcast to all gpus + torch.distributed.broadcast(idx_shuffle, src=0) + + # index for restoring + idx_unshuffle = torch.argsort(idx_shuffle) + + # shuffled index for this gpu + gpu_idx = torch.distributed.get_rank() + idx_this = idx_shuffle.view(num_gpus, -1)[gpu_idx] + + return x_gather[idx_this], idx_unshuffle + + @torch.no_grad() + def _batch_unshuffle_ddp(self, x, idx_unshuffle): + """ + Undo batch shuffle. + *** Only support DistributedDataParallel (DDP) model. 
*** + """ + # gather from all gpus + batch_size_this = x.shape[0] + x_gather = concat_all_gather(x) + batch_size_all = x_gather.shape[0] + + num_gpus = batch_size_all // batch_size_this + + # restored index for this gpu + gpu_idx = torch.distributed.get_rank() + idx_this = idx_unshuffle.view(num_gpus, -1)[gpu_idx] + + return x_gather[idx_this] + + def forward_train(self, img, **kwargs): + assert img.dim() == 5, \ + "Input must have 5 dims, got: {}".format(img.dim()) + im_q = img[:, 0, ...].contiguous() + im_k = img[:, 1, ...].contiguous() + # compute query features + q = self.encoder_q(im_q)[0] # queries: NxC + q = nn.functional.normalize(q, dim=1) + + # compute key features + with torch.no_grad(): # no gradient to keys + self._momentum_update_key_encoder() # update the key encoder + + # shuffle for making use of BN + im_k, idx_unshuffle = self._batch_shuffle_ddp(im_k) + + k = self.encoder_k(im_k)[0] # keys: NxC + k = nn.functional.normalize(k, dim=1) + + # undo shuffle + k = self._batch_unshuffle_ddp(k, idx_unshuffle) + + # compute logits + # Einstein sum is more intuitive + # positive logits: Nx1 + l_pos = torch.einsum('nc,nc->n', [q, k]).unsqueeze(-1) + # negative logits: NxK + l_neg = torch.einsum('nc,ck->nk', [q, self.queue.clone().detach()]) + + losses = self.head(l_pos, l_neg) + self._dequeue_and_enqueue(k) + + return losses + + def forward_test(self, img, **kwargs): + pass + + def forward(self, img, mode='train', **kwargs): + if mode == 'train': + return self.forward_train(img, **kwargs) + elif mode == 'test': + return self.forward_test(img, **kwargs) + elif mode == 'extract': + return self.encoder_q[0](img) + else: + raise Exception("No such mode: {}".format(mode)) + + +# utils +@torch.no_grad() +def concat_all_gather(tensor): + """ + Performs all_gather operation on the provided tensors. + *** Warning ***: torch.distributed.all_gather has no gradient. 
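+
+    Editorial note: the missing gradient is acceptable for MOCO above,
+    since the gathered tensors (keys and augmented images) are used under
+    torch.no_grad() and never require gradients.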
+ """ + tensors_gather = [ + torch.ones_like(tensor) + for _ in range(torch.distributed.get_world_size()) + ] + torch.distributed.all_gather(tensors_gather, tensor, async_op=False) + + output = torch.cat(tensors_gather, dim=0) + return output diff --git a/openselfsup/models/necks.py b/openselfsup/models/necks.py new file mode 100644 index 00000000..7d509cad --- /dev/null +++ b/openselfsup/models/necks.py @@ -0,0 +1,132 @@ +import torch.nn as nn +from mmcv.cnn import kaiming_init, normal_init + +from .registry import NECKS + + +@NECKS.register_module +class LinearNeck(nn.Module): + + def __init__(self, in_channels, out_channels, with_avg_pool=True): + super(LinearNeck, self).__init__() + self.with_avg_pool = with_avg_pool + if with_avg_pool: + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_channels, out_channels) + + def init_weights(self, init_linear='normal'): + assert init_linear in ['normal', 'kaiming'], \ + "Undefined init_linear: {}".format(init_linear) + for m in self.modules(): + if isinstance(m, nn.Linear): + if init_linear == 'normal': + normal_init(m, std=0.01) + else: + kaiming_init(m, mode='fan_in', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, nn.GroupNorm, nn.SyncBatchNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + assert len(x) == 1 + if self.with_avg_pool: + x = self.avgpool(x[0]) + return [self.fc(x.view(x.size(0), -1))] + + +@NECKS.register_module +class NonLinearNeckV0(nn.Module): + + def __init__(self, + in_channels, + hid_channels, + out_channels, + with_avg_pool=True): + super(NonLinearNeckV0, self).__init__() + self.with_avg_pool = with_avg_pool + if with_avg_pool: + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.mlp = nn.Sequential( + nn.Linear(in_channels, hid_channels), + nn.BatchNorm1d(hid_channels, momentum=0.001, affine=False), + nn.ReLU(inplace=True), nn.Dropout(), + nn.Linear(hid_channels, out_channels), nn.ReLU(inplace=True)) + + def init_weights(self, init_linear='normal'): + assert init_linear in ['normal', 'kaiming'], \ + "Undefined init_linear: {}".format(init_linear) + for m in self.modules(): + if isinstance(m, nn.Linear): + if init_linear == 'normal': + normal_init(m, std=0.01) + else: + kaiming_init(m, mode='fan_in', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm1d, nn.BatchNorm2d, nn.GroupNorm, nn.SyncBatchNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + assert len(x) == 1 + if self.with_avg_pool: + x = self.avgpool(x[0]) + return [self.mlp(x.view(x.size(0), -1))] + + +@NECKS.register_module +class NonLinearNeckV1(nn.Module): + + def __init__(self, + in_channels, + hid_channels, + out_channels, + with_avg_pool=True): + super(NonLinearNeckV1, self).__init__() + self.with_avg_pool = with_avg_pool + if with_avg_pool: + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.mlp = nn.Sequential( + nn.Linear(in_channels, hid_channels), nn.ReLU(inplace=True), + nn.Linear(hid_channels, out_channels)) + + def init_weights(self, init_linear='normal'): + assert init_linear in ['normal', 'kaiming'], \ + "Undefined init_linear: {}".format(init_linear) + for m in self.modules(): + if isinstance(m, nn.Linear): + if init_linear == 'normal': + normal_init(m, std=0.01) + else: + kaiming_init(m, mode='fan_in', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, nn.GroupNorm, nn.SyncBatchNorm)): + if m.weight 
is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + assert len(x) == 1 + if self.with_avg_pool: + x = self.avgpool(x[0]) + return [self.mlp(x.view(x.size(0), -1))] + + +@NECKS.register_module +class AvgPoolNeck(nn.Module): + + def __init__(self): + super(AvgPoolNeck, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + + def init_weights(self, **kwargs): + pass + + def forward(self, x): + assert len(x) == 1 + return [self.avg_pool(x[0])] diff --git a/openselfsup/models/npid.py b/openselfsup/models/npid.py new file mode 100644 index 00000000..34e59ab5 --- /dev/null +++ b/openselfsup/models/npid.py @@ -0,0 +1,100 @@ +import torch +import torch.nn as nn + +from openselfsup.utils import print_log + +from . import builder +from .registry import MODELS + + +@MODELS.register_module +class NPID(nn.Module): + '''Model of "Unsupervised Feature Learning via Non-parametric + Instance Discrimination". + Arguments: + neg_num (int): number of negative samples for each image + ensure_neg (bool): if False, there is a small probability + that negative samples contain positive ones. + ''' + + def __init__(self, + backbone, + neck=None, + head=None, + memory_bank=None, + neg_num=65536, + ensure_neg=False, + pretrained=None): + super(NPID, self).__init__() + self.backbone = builder.build_backbone(backbone) + self.neck = builder.build_neck(neck) + self.head = builder.build_head(head) + self.memory_bank = builder.build_memory(memory_bank) + self.init_weights(pretrained=pretrained) + + self.neg_num = neg_num + self.ensure_neg = ensure_neg + + def init_weights(self, pretrained=None): + if pretrained is not None: + print_log('load model from: {}'.format(pretrained), logger='root') + self.backbone.init_weights(pretrained=pretrained) + self.neck.init_weights(init_linear='kaiming') + + def forward_backbone(self, img): + """Forward backbone + + Returns: + x (tuple): backbone outputs + """ + x = self.backbone(img) + return x + + def forward_train(self, img, idx, **kwargs): + x = self.forward_backbone(img) + idx = idx.cuda() + feature = self.neck(x)[0] + feature = nn.functional.normalize(feature) # BxC + bs, feat_dim = feature.shape[:2] + neg_idx = self.memory_bank.multinomial.draw(bs * self.neg_num) + if self.ensure_neg: + neg_idx = neg_idx.view(bs, -1) + while True: + wrong = (neg_idx == idx.view(-1, 1)) + if wrong.sum().item() > 0: + neg_idx[wrong] = self.memory_bank.multinomial.draw( + wrong.sum().item()) + else: + break + neg_idx = neg_idx.flatten() + + pos_feat = torch.index_select(self.memory_bank.feature_bank, 0, + idx) # BXC + neg_feat = torch.index_select(self.memory_bank.feature_bank, 0, + neg_idx).view(bs, self.neg_num, + feat_dim) # BxKxC + + pos_logits = torch.einsum('nc,nc->n', + [pos_feat, feature]).unsqueeze(-1) + neg_logits = torch.bmm(neg_feat, feature.unsqueeze(2)).squeeze(2) + + losses = self.head(pos_logits, neg_logits) + + # update memory bank + with torch.no_grad(): + self.memory_bank.update(idx, feature.detach()) + + return losses + + def forward_test(self, img, **kwargs): + pass + + def forward(self, img, mode='train', **kwargs): + if mode == 'train': + return self.forward_train(img, **kwargs) + elif mode == 'test': + return self.forward_test(img, **kwargs) + elif mode == 'extract': + return self.forward_backbone(img) + else: + raise Exception("No such mode: {}".format(mode)) diff --git a/openselfsup/models/odc.py b/openselfsup/models/odc.py new file mode 100644 index 00000000..f9cd2fce --- /dev/null +++ 
b/openselfsup/models/odc.py @@ -0,0 +1,103 @@ +import numpy as np +import torch +import torch.nn as nn + +from openselfsup.utils import print_log +from . import builder +from .registry import MODELS +from .utils import Sobel + + +@MODELS.register_module +class ODC(nn.Module): + + def __init__(self, + backbone, + with_sobel=False, + neck=None, + head=None, + memory_bank=None, + pretrained=None): + super(ODC, self).__init__() + self.with_sobel = with_sobel + if with_sobel: + self.sobel_layer = Sobel() + self.backbone = builder.build_backbone(backbone) + self.neck = builder.build_neck(neck) + if head is not None: + self.head = builder.build_head(head) + if memory_bank is not None: + self.memory_bank = builder.build_memory(memory_bank) + self.init_weights(pretrained=pretrained) + + # set reweight tensors + self.num_classes = head.num_classes + self.loss_weight = torch.ones((self.num_classes, ), + dtype=torch.float32).cuda() + self.loss_weight /= self.loss_weight.sum() + + def init_weights(self, pretrained=None): + if pretrained is not None: + print_log('load model from: {}'.format(pretrained), logger='root') + self.backbone.init_weights(pretrained=pretrained) + self.neck.init_weights(init_linear='kaiming') + self.head.init_weights(init_linear='normal') + + def forward_backbone(self, img): + """Forward backbone + + Returns: + x (tuple): backbone outputs + """ + if self.with_sobel: + img = self.sobel_layer(img) + x = self.backbone(img) + return x + + def forward_train(self, img, idx, **kwargs): + # forward & backward + x = self.forward_backbone(img) + feature = self.neck(x) + outs = self.head(feature) + if self.memory_bank.label_bank.is_cuda: + loss_inputs = (outs, self.memory_bank.label_bank[idx]) + else: + loss_inputs = (outs, self.memory_bank.label_bank[idx.cpu()].cuda()) + losses = self.head.loss(*loss_inputs) + + # update samples memory + change_ratio = self.memory_bank.update_samples_memory( + idx, feature[0].detach()) + losses['change_ratio'] = change_ratio + + return losses + + def forward_test(self, img, **kwargs): + x = self.forward_backbone(img) # tuple + outs = self.head(x) + keys = ['head{}'.format(i) for i in range(len(outs))] + out_tensors = [out.cpu() for out in outs] # NxC + return dict(zip(keys, out_tensors)) + + def forward(self, img, mode='train', **kwargs): + if mode == 'train': + return self.forward_train(img, **kwargs) + elif mode == 'test': + return self.forward_test(img, **kwargs) + elif mode == 'extract': + return self.forward_backbone(img) + else: + raise Exception("No such mode: {}".format(mode)) + + def set_reweight(self, labels=None, reweight_pow=0.5): + if labels is None: + if self.memory_bank.label_bank.is_cuda: + labels = self.memory_bank.label_bank.cpu().numpy() + else: + labels = self.memory_bank.label_bank.numpy() + hist = np.bincount( + labels, minlength=self.num_classes).astype(np.float32) + inv_hist = (1. 
/ (hist + 1e-5))**reweight_pow + weight = inv_hist / inv_hist.sum() + self.loss_weight.copy_(torch.from_numpy(weight)) + self.head.criterion = nn.CrossEntropyLoss(weight=self.loss_weight) diff --git a/openselfsup/models/registry.py b/openselfsup/models/registry.py new file mode 100644 index 00000000..8e1611ee --- /dev/null +++ b/openselfsup/models/registry.py @@ -0,0 +1,8 @@ +from openselfsup.utils import Registry + +MODELS = Registry('model') +BACKBONES = Registry('backbone') +NECKS = Registry('neck') +HEADS = Registry('head') +MEMORIES = Registry('memory') +LOSSES = Registry('loss') diff --git a/openselfsup/models/rotation_pred.py b/openselfsup/models/rotation_pred.py new file mode 100644 index 00000000..87c8a3b4 --- /dev/null +++ b/openselfsup/models/rotation_pred.py @@ -0,0 +1,63 @@ +import torch +import torch.nn as nn + +from openselfsup.utils import print_log + +from . import builder +from .registry import MODELS + + +@MODELS.register_module +class RotationPred(nn.Module): + + def __init__(self, backbone, head=None, pretrained=None): + super(RotationPred, self).__init__() + self.backbone = builder.build_backbone(backbone) + if head is not None: + self.head = builder.build_head(head) + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + if pretrained is not None: + print_log('load model from: {}'.format(pretrained), logger='root') + self.backbone.init_weights(pretrained=pretrained) + self.head.init_weights(init_linear='kaiming') + + def forward_backbone(self, img): + """Forward backbone + + Returns: + x (tuple): backbone outputs + """ + x = self.backbone(img) + return x + + def forward_train(self, img, rot_label, **kwargs): + x = self.forward_backbone(img) + outs = self.head(x) + loss_inputs = (outs, rot_label) + losses = self.head.loss(*loss_inputs) + return losses + + def forward_test(self, img, **kwargs): + x = self.forward_backbone(img) # tuple + outs = self.head(x) + keys = ['head{}'.format(i) for i in range(len(outs))] + out_tensors = [out.cpu() for out in outs] # NxC + return dict(zip(keys, out_tensors)) + + def forward(self, img, rot_label=None, mode='train', **kwargs): + if mode != "extract" and img.dim() == 5: + assert rot_label.dim() == 2 + img = img.view( + img.size(0) * img.size(1), img.size(2), img.size(3), + img.size(4)) + rot_label = torch.flatten(rot_label) + if mode == 'train': + return self.forward_train(img, rot_label, **kwargs) + elif mode == 'test': + return self.forward_test(img, **kwargs) + elif mode == 'extract': + return self.forward_backbone(img) + else: + raise Exception("No such mode: {}".format(mode)) diff --git a/openselfsup/models/simclr.py b/openselfsup/models/simclr.py new file mode 100644 index 00000000..16ece396 --- /dev/null +++ b/openselfsup/models/simclr.py @@ -0,0 +1,79 @@ +import torch +import torch.nn as nn + +from openselfsup.utils import print_log + +from . 
import builder +from .registry import MODELS +from .utils import GatherLayer + + +@MODELS.register_module +class SimCLR(nn.Module): + + def __init__(self, backbone, neck=None, head=None, pretrained=None): + super(SimCLR, self).__init__() + self.backbone = builder.build_backbone(backbone) + self.neck = builder.build_neck(neck) + self.head = builder.build_head(head) + self.init_weights(pretrained=pretrained) + + @staticmethod + def _create_buffer(N): + mask = 1 - torch.eye(N * 2, dtype=torch.uint8).cuda() + pos_ind = (torch.arange(N * 2).cuda(), + 2 * torch.arange(N, dtype=torch.long).unsqueeze(1).repeat( + 1, 2).view(-1, 1).squeeze().cuda()) + neg_mask = torch.ones((N * 2, N * 2 - 1), dtype=torch.uint8).cuda() + neg_mask[pos_ind] = 0 + return mask, pos_ind, neg_mask + + def init_weights(self, pretrained=None): + if pretrained is not None: + print_log('load model from: {}'.format(pretrained), logger='root') + self.backbone.init_weights(pretrained=pretrained) + self.neck.init_weights(init_linear='kaiming') + + def forward_backbone(self, img): + """Forward backbone + + Returns: + x (tuple): backbone outputs + """ + x = self.backbone(img) + return x + + def forward_train(self, img, **kwargs): + assert img.dim() == 5, \ + "Input must have 5 dims, got: {}".format(img.dim()) + img = img.reshape( + img.size(0) * 2, img.size(2), img.size(3), img.size(4)) + x = self.forward_backbone(img) # 2n + z = self.neck(x)[0] # (2n)xd + z = z / (torch.norm(z, p=2, dim=1, keepdim=True) + 1e-10) + z = torch.cat(GatherLayer.apply(z), dim=0) # (2N)xd + assert z.size(0) % 2 == 0 + N = z.size(0) // 2 + s = torch.matmul(z, z.permute(1, 0)) # (2N)x(2N) + mask, pos_ind, neg_mask = self._create_buffer(N) + # remove diagonal, (2N)x(2N-1) + s = torch.masked_select(s, mask).reshape(s.size(0), -1) + positive = s[pos_ind].unsqueeze(1) # (2N)x1 + # select negative, (2N)x(2N-2) + negative = torch.masked_select(s, neg_mask).reshape(s.size(0), -1) + losses = self.head(positive, negative) + return losses + + def forward_test(self, img, **kwargs): + pass + + def forward(self, img, mode='train', **kwargs): + if mode == 'train': + return self.forward_train(img, **kwargs) + elif mode == 'test': + return self.forward_test(img, **kwargs) + elif mode == 'extract': + return self.forward_backbone(img) + else: + raise Exception("No such mode: {}".format(mode)) diff --git a/openselfsup/models/utils/__init__.py b/openselfsup/models/utils/__init__.py new file mode 100644 index 00000000..a5d1f553 --- /dev/null +++ b/openselfsup/models/utils/__init__.py @@ -0,0 +1,16 @@ +from .accuracy import Accuracy, accuracy +from .conv_module import ConvModule, build_conv_layer +from .conv_ws import ConvWS2d, conv_ws_2d +from .gather_layer import GatherLayer +from .multi_pooling import MultiPooling +from .norm import build_norm_layer +from .scale import Scale +#from .weight_init import (bias_init_with_prob, kaiming_init, normal_init, +# uniform_init, xavier_init) +from .sobel import Sobel + +#__all__ = [ +# 'conv_ws_2d', 'ConvWS2d', 'build_conv_layer', 'ConvModule', +# 'build_norm_layer', 'xavier_init', 'normal_init', 'uniform_init', +# 'kaiming_init', 'bias_init_with_prob', 'Scale', 'Sobel' +#] diff --git a/openselfsup/models/utils/accuracy.py b/openselfsup/models/utils/accuracy.py new file mode 100644 index 00000000..20d0ad8c --- /dev/null +++ b/openselfsup/models/utils/accuracy.py @@ -0,0 +1,31 @@ +import torch.nn as nn + + +def accuracy(pred, target, topk=1): + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, )
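+        # e.g. topk=1 is normalized to the tuple (1, ) so the loop below
+        # handles the int and tuple call styles uniformly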
+ return_single = True + else: + return_single = False + + maxk = max(topk) + _, pred_label = pred.topk(maxk, dim=1) + pred_label = pred_label.t() + correct = pred_label.eq(target.view(1, -1).expand_as(pred_label)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / pred.size(0))) + return res[0] if return_single else res + + +class Accuracy(nn.Module): + + def __init__(self, topk=(1, )): + super().__init__() + self.topk = topk + + def forward(self, pred, target): + return accuracy(pred, target, self.topk) diff --git a/openselfsup/models/utils/conv_module.py b/openselfsup/models/utils/conv_module.py new file mode 100644 index 00000000..2ea56d87 --- /dev/null +++ b/openselfsup/models/utils/conv_module.py @@ -0,0 +1,163 @@ +import warnings + +import torch.nn as nn +from mmcv.cnn import constant_init, kaiming_init + +from .conv_ws import ConvWS2d +from .norm import build_norm_layer + +conv_cfg = { + 'Conv': nn.Conv2d, + 'ConvWS': ConvWS2d, +} + + +def build_conv_layer(cfg, *args, **kwargs): + """ Build convolution layer + + Args: + cfg (None or dict): cfg should contain: + type (str): identify conv layer type. + layer args: args needed to instantiate a conv layer. + + Returns: + layer (nn.Module): created conv layer + """ + if cfg is None: + cfg_ = dict(type='Conv') + else: + assert isinstance(cfg, dict) and 'type' in cfg + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in conv_cfg: + raise KeyError('Unrecognized norm type {}'.format(layer_type)) + else: + conv_layer = conv_cfg[layer_type] + + layer = conv_layer(*args, **kwargs, **cfg_) + + return layer + + +class ConvModule(nn.Module): + """A conv block that contains conv/norm/activation layers. + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int or tuple[int]): Same as nn.Conv2d. + stride (int or tuple[int]): Same as nn.Conv2d. + padding (int or tuple[int]): Same as nn.Conv2d. + dilation (int or tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + bias (bool or str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if norm_cfg is None, otherwise + False. + conv_cfg (dict): Config dict for convolution layer. + norm_cfg (dict): Config dict for normalization layer. + activation (str or None): Activation type, "ReLU" by default. + inplace (bool): Whether to use inplace mode for activation. + order (tuple[str]): The order of conv/norm/activation layers. It is a + sequence of "conv", "norm" and "act". Examples are + ("conv", "norm", "act") and ("act", "conv", "norm"). + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias='auto', + conv_cfg=None, + norm_cfg=None, + activation='relu', + inplace=True, + order=('conv', 'norm', 'act')): + super(ConvModule, self).__init__() + assert conv_cfg is None or isinstance(conv_cfg, dict) + assert norm_cfg is None or isinstance(norm_cfg, dict) + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.activation = activation + self.inplace = inplace + self.order = order + assert isinstance(self.order, tuple) and len(self.order) == 3 + assert set(order) == set(['conv', 'norm', 'act']) + + self.with_norm = norm_cfg is not None + self.with_activation = activation is not None + # if the conv layer is before a norm layer, bias is unnecessary. 
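+        # (e.g. in a conv->BN stack the BN affine shift absorbs any conv
+        # bias, so bias defaults to False whenever a norm layer is set)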
+ if bias == 'auto': + bias = False if self.with_norm else True + self.with_bias = bias + + if self.with_norm and self.with_bias: + warnings.warn('ConvModule has norm and bias at the same time') + + # build convolution layer + self.conv = build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + # export the attributes of self.conv to a higher level for convenience + self.in_channels = self.conv.in_channels + self.out_channels = self.conv.out_channels + self.kernel_size = self.conv.kernel_size + self.stride = self.conv.stride + self.padding = self.conv.padding + self.dilation = self.conv.dilation + self.transposed = self.conv.transposed + self.output_padding = self.conv.output_padding + self.groups = self.conv.groups + + # build normalization layers + if self.with_norm: + # norm layer is after conv layer + if order.index('norm') > order.index('conv'): + norm_channels = out_channels + else: + norm_channels = in_channels + self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) + self.add_module(self.norm_name, norm) + + # build activation layer + if self.with_activation: + # TODO: introduce `act_cfg` and supports more activation layers + if self.activation not in ['relu']: + raise ValueError('{} is currently not supported.'.format( + self.activation)) + if self.activation == 'relu': + self.activate = nn.ReLU(inplace=inplace) + + # Use msra init by default + self.init_weights() + + @property + def norm(self): + return getattr(self, self.norm_name) + + def init_weights(self): + nonlinearity = 'relu' if self.activation is None else self.activation + kaiming_init(self.conv, mode='fan_in', nonlinearity=nonlinearity) + if self.with_norm: + constant_init(self.norm, 1, bias=0) + + def forward(self, x, activate=True, norm=True): + for layer in self.order: + if layer == 'conv': + x = self.conv(x) + elif layer == 'norm' and norm and self.with_norm: + x = self.norm(x) + elif layer == 'act' and activate and self.with_activation: + x = self.activate(x) + return x diff --git a/openselfsup/models/utils/conv_ws.py b/openselfsup/models/utils/conv_ws.py new file mode 100644 index 00000000..5ccd735f --- /dev/null +++ b/openselfsup/models/utils/conv_ws.py @@ -0,0 +1,46 @@ +import torch.nn as nn +import torch.nn.functional as F + + +def conv_ws_2d(input, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + eps=1e-5): + c_in = weight.size(0) + weight_flat = weight.view(c_in, -1) + mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) + std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) + weight = (weight - mean) / (std + eps) + return F.conv2d(input, weight, bias, stride, padding, dilation, groups) + + +class ConvWS2d(nn.Conv2d): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + eps=1e-5): + super(ConvWS2d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + self.eps = eps + + def forward(self, x): + return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, + self.dilation, self.groups, self.eps) diff --git a/openselfsup/models/utils/gather_layer.py b/openselfsup/models/utils/gather_layer.py new file mode 100644 index 00000000..8b73708e --- /dev/null +++ b/openselfsup/models/utils/gather_layer.py @@ -0,0 +1,22 @@ +import torch +import torch.distributed as dist 
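+
+# Illustrative sketch (not part of the original file): in a DDP training step
+# one might write
+#   z_all = torch.cat(GatherLayer.apply(z), dim=0)
+# to build a loss over features from every rank while keeping gradients
+# flowing back to the local slice, which a plain dist.all_gather would drop.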
+ + +class GatherLayer(torch.autograd.Function): + '''Gather tensors from all process, supporting backward propagation. + ''' + + @staticmethod + def forward(ctx, input): + ctx.save_for_backward(input) + output = [torch.zeros_like(input) \ + for _ in range(dist.get_world_size())] + dist.all_gather(output, input) + return tuple(output) + + @staticmethod + def backward(ctx, *grads): + input, = ctx.saved_tensors + grad_out = torch.zeros_like(input) + grad_out[:] = grads[dist.get_rank()] + return grad_out diff --git a/openselfsup/models/utils/multi_pooling.py b/openselfsup/models/utils/multi_pooling.py new file mode 100644 index 00000000..1440be03 --- /dev/null +++ b/openselfsup/models/utils/multi_pooling.py @@ -0,0 +1,38 @@ +import torch.nn as nn + + +class MultiPooling(nn.Module): + """Pooling layers for features from multiple depth. + """ + POOL_PARAMS = { + 'resnet50': [ + dict(kernel_size=10, stride=10, padding=4), + dict(kernel_size=16, stride=8, padding=0), + dict(kernel_size=13, stride=5, padding=0), + dict(kernel_size=8, stride=3, padding=0), + dict(kernel_size=6, stride=1, padding=0) + ] + } + POOL_SIZES = {'resnet50': [12, 6, 4, 3, 2]} + POOL_DIMS = {'resnet50': [9216, 9216, 8192, 9216, 8192]} + + def __init__(self, + pool_type='adaptive', + in_indices=(0, ), + backbone='resnet50'): + super(MultiPooling, self).__init__() + assert pool_type in ['adaptive', 'specified'] + if pool_type == 'adaptive': + self.pools = nn.ModuleList([ + nn.AdaptiveAvgPool2d(self.POOL_SIZES[backbone][l]) + for l in in_indices + ]) + else: + self.pools = nn.ModuleList([ + nn.AvgPool2d(**self.POOL_PARAMS[backbone][l]) + for l in in_indices + ]) + + def forward(self, x): + assert isinstance(x, (list, tuple)) + return [p(xx) for p, xx in zip(self.pools, x)] diff --git a/openselfsup/models/utils/norm.py b/openselfsup/models/utils/norm.py new file mode 100644 index 00000000..d5687cbd --- /dev/null +++ b/openselfsup/models/utils/norm.py @@ -0,0 +1,55 @@ +import torch.nn as nn + +norm_cfg = { + # format: layer_type: (abbreviation, module) + 'BN': ('bn', nn.BatchNorm2d), + 'SyncBN': ('bn', nn.SyncBatchNorm), + 'GN': ('gn', nn.GroupNorm), + # and potentially 'SN' +} + + +def build_norm_layer(cfg, num_features, postfix=''): + """ Build normalization layer + + Args: + cfg (dict): cfg should contain: + type (str): identify norm layer type. + layer args: args needed to instantiate a norm layer. + requires_grad (bool): [optional] whether stop gradient updates + num_features (int): number of channels from input. + postfix (int, str): appended into norm abbreviation to + create named layer. 
+ + Returns: + name (str): abbreviation + postfix + layer (nn.Module): created norm layer + """ + assert isinstance(cfg, dict) and 'type' in cfg + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in norm_cfg: + raise KeyError('Unrecognized norm type {}'.format(layer_type)) + else: + abbr, norm_layer = norm_cfg[layer_type] + if norm_layer is None: + raise NotImplementedError + + assert isinstance(postfix, (int, str)) + name = abbr + str(postfix) + + requires_grad = cfg_.pop('requires_grad', True) + cfg_.setdefault('eps', 1e-5) + if layer_type != 'GN': + layer = norm_layer(num_features, **cfg_) + if layer_type == 'SyncBN': + layer._specify_ddp_gpu_num(1) + else: + assert 'num_groups' in cfg_ + layer = norm_layer(num_channels=num_features, **cfg_) + + for param in layer.parameters(): + param.requires_grad = requires_grad + + return name, layer diff --git a/openselfsup/models/utils/scale.py b/openselfsup/models/utils/scale.py new file mode 100644 index 00000000..2461af8a --- /dev/null +++ b/openselfsup/models/utils/scale.py @@ -0,0 +1,15 @@ +import torch +import torch.nn as nn + + +class Scale(nn.Module): + """ + A learnable scale parameter + """ + + def __init__(self, scale=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) + + def forward(self, x): + return x * self.scale diff --git a/openselfsup/models/utils/sobel.py b/openselfsup/models/utils/sobel.py new file mode 100644 index 00000000..73ef30e7 --- /dev/null +++ b/openselfsup/models/utils/sobel.py @@ -0,0 +1,23 @@ +import torch +import torch.nn as nn + + +class Sobel(nn.Module): + + def __init__(self): + super(Sobel, self).__init__() + grayscale = nn.Conv2d(3, 1, kernel_size=1, stride=1, padding=0) + grayscale.weight.data.fill_(1.0 / 3.0) + grayscale.bias.data.zero_() + sobel_filter = nn.Conv2d(1, 2, kernel_size=3, stride=1, padding=1) + sobel_filter.weight.data[0, 0].copy_( + torch.FloatTensor([[1, 0, -1], [2, 0, -2], [1, 0, -1]])) + sobel_filter.weight.data[1, 0].copy_( + torch.FloatTensor([[1, 2, 1], [0, 0, 0], [-1, -2, -1]])) + sobel_filter.bias.data.zero_() + self.sobel = nn.Sequential(grayscale, sobel_filter) + for p in self.sobel.parameters(): + p.requires_grad = False + + def forward(self, x): + return self.sobel(x) diff --git a/openselfsup/third_party/clustering.py b/openselfsup/third_party/clustering.py new file mode 100644 index 00000000..d84459b0 --- /dev/null +++ b/openselfsup/third_party/clustering.py @@ -0,0 +1,308 @@ +# This file is modified from +# https://github.com/facebookresearch/deepcluster/blob/master/clustering.py + +import time +import numpy as np +import faiss +import torch + +__all__ = ['Kmeans', 'PIC'] + + +def preprocess_features(npdata, pca): + """Preprocess an array of features. 
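+
+    The steps are: optional PCA-whitening via faiss (eigen_power=-0.5) down
+    to `pca` dims, then row-wise L2 normalization.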
+ Args: + npdata (np.array N * ndim): features to preprocess + pca (int): dim of output + Returns: + np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized + """ + _, ndim = npdata.shape + #npdata = npdata.astype('float32') + assert npdata.dtype == np.float32 + + if np.any(np.isnan(npdata)): + raise Exception("nan occurs") + if pca != -1: + print("\nPCA from dim {} to dim {}".format(ndim, pca)) + mat = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5) + mat.train(npdata) + assert mat.is_trained + npdata = mat.apply_py(npdata) + if np.any(np.isnan(npdata)): + percent = np.isnan(npdata).sum().item() / float(np.size(npdata)) * 100 + if percent > 0.1: + raise Exception( + "More than 0.1% nan occurs after pca, percent: {}%".format( + percent)) + else: + npdata[np.isnan(npdata)] = 0. + # L2 normalization + row_sums = np.linalg.norm(npdata, axis=1) + + npdata = npdata / (row_sums[:, np.newaxis] + 1e-10) + + return npdata + + +def make_graph(xb, nnn): + """Builds a graph of nearest neighbors. + Args: + xb (np.array): data + nnn (int): number of nearest neighbors + Returns: + list: for each data the list of ids to its nnn nearest neighbors + list: for each data the list of distances to its nnn NN + """ + N, dim = xb.shape + + # we need only a StandardGpuResources per GPU + res = faiss.StandardGpuResources() + + # L2 + flat_config = faiss.GpuIndexFlatConfig() + flat_config.device = int(torch.cuda.device_count()) - 1 + index = faiss.GpuIndexFlatL2(res, dim, flat_config) + index.add(xb) + D, I = index.search(xb, nnn + 1) + return I, D + + +def run_kmeans(x, nmb_clusters, verbose=False): + """Runs kmeans on 1 GPU. + Args: + x: data + nmb_clusters (int): number of clusters + Returns: + list: ids of data in each cluster + """ + n_data, d = x.shape + + # faiss implementation of k-means + clus = faiss.Clustering(d, nmb_clusters) + + # Change faiss seed at each k-means so that the randomly picked + # initialization centroids do not correspond to the same feature ids + # from an epoch to another. + clus.seed = np.random.randint(1234) + + clus.niter = 20 + clus.max_points_per_centroid = 10000000 + res = faiss.StandardGpuResources() + flat_config = faiss.GpuIndexFlatConfig() + flat_config.useFloat16 = False + flat_config.device = 0 + index = faiss.GpuIndexFlatL2(res, d, flat_config) + + # perform the training + clus.train(x, index) + _, I = index.search(x, 1) + losses = faiss.vector_to_array(clus.obj) + if verbose: + print('k-means loss evolution: {0}'.format(losses)) + + return [int(n[0]) for n in I], losses[-1] + + +def arrange_clustering(images_lists): + pseudolabels = [] + image_indexes = [] + for cluster, images in enumerate(images_lists): + image_indexes.extend(images) + pseudolabels.extend([cluster] * len(images)) + indexes = np.argsort(image_indexes) + return np.asarray(pseudolabels)[indexes] + + +class Kmeans: + + def __init__(self, k, pca_dim=256): + self.k = k + self.pca_dim = pca_dim + + def cluster(self, feat, verbose=False): + """Performs k-means clustering. + Args: + x_data (np.array N * dim): data to cluster + """ + end = time.time() + + # PCA-reducing, whitening and L2-normalization + xb = preprocess_features(feat, self.pca_dim) + + # cluster the data + I, loss = run_kmeans(xb, self.k, verbose) + self.labels = np.array(I) + if verbose: + print('k-means time: {0:.0f} s'.format(time.time() - end)) + + return loss + + +def make_adjacencyW(I, D, sigma): + """Create adjacency matrix with a Gaussian kernel. 
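+
+    Edge weights are w = exp(-d / sigma**2) for each of the nnn neighbor
+    distances d; the first column (self-distances) is dropped.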
+ Args: + I (numpy array): for each vertex the ids to its nnn linked vertices + + first column of identity. + D (numpy array): for each data the l2 distances to its nnn linked vertices + + first column of zeros. + sigma (float): Bandwith of the Gaussian kernel. + + Returns: + csr_matrix: affinity matrix of the graph. + """ + V, k = I.shape + k = k - 1 + indices = np.reshape(np.delete(I, 0, 1), (1, -1)) + indptr = np.multiply(k, np.arange(V + 1)) + + def exp_ker(d): + return np.exp(-d / sigma**2) + + exp_ker = np.vectorize(exp_ker) + res_D = exp_ker(D) + data = np.reshape(np.delete(res_D, 0, 1), (1, -1)) + adj_matrix = csr_matrix((data[0], indices[0], indptr), shape=(V, V)) + return adj_matrix + + +def run_pic(I, D, sigma, alpha): + """Run PIC algorithm""" + a = make_adjacencyW(I, D, sigma) + graph = a + a.transpose() + cgraph = graph + nim = graph.shape[0] + + W = graph + t0 = time.time() + + v0 = np.ones(nim) / nim + + # power iterations + v = v0.astype('float32') + + t0 = time.time() + dt = 0 + for i in range(200): + vnext = np.zeros(nim, dtype='float32') + + vnext = vnext + W.transpose().dot(v) + + vnext = alpha * vnext + (1 - alpha) / nim + # L1 normalize + vnext /= vnext.sum() + v = vnext + + if (i == 200 - 1): + clust = find_maxima_cluster(W, v) + + return [int(i) for i in clust] + + +def find_maxima_cluster(W, v): + n, m = W.shape + assert (n == m) + assign = np.zeros(n) + # for each node + pointers = list(range(n)) + for i in range(n): + best_vi = 0 + l0 = W.indptr[i] + l1 = W.indptr[i + 1] + for l in range(l0, l1): + j = W.indices[l] + vi = W.data[l] * (v[j] - v[i]) + if vi > best_vi: + best_vi = vi + pointers[i] = j + n_clus = 0 + cluster_ids = -1 * np.ones(n) + for i in range(n): + if pointers[i] == i: + cluster_ids[i] = n_clus + n_clus = n_clus + 1 + for i in range(n): + # go from pointers to pointers starting from i until reached a local optim + current_node = i + while pointers[current_node] != current_node: + current_node = pointers[current_node] + + assign[i] = cluster_ids[current_node] + assert (assign[i] >= 0) + return assign + + +class PIC(): + """Class to perform Power Iteration Clustering on a graph of nearest neighbors. + Args: + args: for consistency with k-means init + sigma (float): bandwith of the Gaussian kernel (default 0.2) + nnn (int): number of nearest neighbors (default 5) + alpha (float): parameter in PIC (default 0.001) + distribute_singletons (bool): If True, reassign each singleton to + the cluster of its closest non + singleton nearest neighbors (up to nnn + nearest neighbors). 
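+        pca_dim (int): output dim of the PCA preprocessing (default 256).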
+ Attributes: + images_lists (list of list): for each cluster, the list of image indexes + belonging to this cluster + """ + + def __init__(self, + args=None, + sigma=0.2, + nnn=5, + alpha=0.001, + distribute_singletons=True, + pca_dim=256): + self.sigma = sigma + self.alpha = alpha + self.nnn = nnn + self.distribute_singletons = distribute_singletons + self.pca_dim = pca_dim + + def cluster(self, data, verbose=False): + end = time.time() + + # preprocess the data + xb = preprocess_features(data, self.pca_dim) + + # construct nnn graph + I, D = make_graph(xb, self.nnn) + + # run PIC + clust = run_pic(I, D, self.sigma, self.alpha) + images_lists = {} + for h in set(clust): + images_lists[h] = [] + for data, c in enumerate(clust): + images_lists[c].append(data) + + # allocate singletons to clusters of their closest NN not singleton + if self.distribute_singletons: + clust_NN = {} + for i in images_lists: + # if singleton + if len(images_lists[i]) == 1: + s = images_lists[i][0] + # for NN + for n in I[s, 1:]: + # if NN is not a singleton + if not len(images_lists[clust[n]]) == 1: + clust_NN[s] = n + break + for s in clust_NN: + del images_lists[clust[s]] + clust[s] = clust[clust_NN[s]] + images_lists[clust[s]].append(s) + + self.images_lists = [] + self.labels = -1 * np.ones((data.shape[0], ), dtype=np.int) + for i, c in enumerate(images_lists): + self.images_lists.append(images_lists[c]) + self.labels[images_lists[c]] = i + assert np.all(self.labels != -1) + + if verbose: + print('pic time: {0:.0f} s'.format(time.time() - end)) + return 0 diff --git a/openselfsup/utils/__init__.py b/openselfsup/utils/__init__.py new file mode 100644 index 00000000..94a634fe --- /dev/null +++ b/openselfsup/utils/__init__.py @@ -0,0 +1,8 @@ +from .alias_multinomial import AliasMethod +from .collect import nondist_forward_collect, dist_forward_collect +from .collect_env import collect_env +from .config_tools import traverse_replace +from .flops_counter import get_model_complexity_info +from .logger import get_root_logger, print_log +from .registry import Registry, build_from_cfg +from . import optimizers diff --git a/openselfsup/utils/alias_multinomial.py b/openselfsup/utils/alias_multinomial.py new file mode 100644 index 00000000..bad70bc5 --- /dev/null +++ b/openselfsup/utils/alias_multinomial.py @@ -0,0 +1,66 @@ +import torch +import numpy as np + + +class AliasMethod(object): + ''' + From: https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ + ''' + + def __init__(self, probs): + + if probs.sum() > 1: + probs.div_(probs.sum()) + K = len(probs) + self.prob = torch.zeros(K) + self.alias = torch.LongTensor([0] * K) + + # Sort the data into the outcomes with probabilities + # that are larger and smaller than 1/K. + smaller = [] + larger = [] + for kk, prob in enumerate(probs): + self.prob[kk] = K * prob + if self.prob[kk] < 1.0: + smaller.append(kk) + else: + larger.append(kk) + + # Loop though and create little binary mixtures that + # appropriately allocate the larger outcomes over the + # overall uniform mixture. 
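+        # Invariant after setup: slot kk keeps prob[kk] in [0, 1] plus one
+        # alias[kk], so draw() is O(1) per sample -- pick a slot uniformly,
+        # then flip a prob[kk]-biased coin between kk and alias[kk].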
+ while len(smaller) > 0 and len(larger) > 0: + small = smaller.pop() + large = larger.pop() + + self.alias[small] = large + self.prob[large] = (self.prob[large] - 1.0) + self.prob[small] + + if self.prob[large] < 1.0: + smaller.append(large) + else: + larger.append(large) + + for last_one in smaller + larger: + self.prob[last_one] = 1 + + def cuda(self): + self.prob = self.prob.cuda() + self.alias = self.alias.cuda() + + def draw(self, N): + ''' + Draw N samples from multinomial + ''' + K = self.alias.size(0) + + kk = torch.zeros( + N, dtype=torch.long, device=self.prob.device).random_(0, K) + prob = self.prob.index_select(0, kk) + alias = self.alias.index_select(0, kk) + # b is whether a random number is greater than q + b = torch.bernoulli(prob) + oq = kk.mul(b.long()) + oj = alias.mul((1 - b).long()) + + return oq + oj diff --git a/openselfsup/utils/collect.py b/openselfsup/utils/collect.py new file mode 100644 index 00000000..b69b1d81 --- /dev/null +++ b/openselfsup/utils/collect.py @@ -0,0 +1,83 @@ +import numpy as np + +import mmcv +import torch + +from .gather import gather_tensors_batch + + +def nondist_forward_collect(func, data_loader, length): + '''Forward and collect network outputs. + + This function performs forward propagation and collects outputs. + It can be used to collect results, features, losses, etc. + + Args: + func (function): The function to process data. The output must be + a dictionary of CPU tensors. + length (int): Expected length of output arrays. + + Returns: + results_all (dict(np.ndarray)): The concatenated outputs. + ''' + results = [] + prog_bar = mmcv.ProgressBar(len(data_loader)) + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = func(**data) + results.append(result) + prog_bar.update() + + results_all = {} + for k in results[0].keys(): + results_all[k] = np.concatenate( + [batch[k].numpy() for batch in results], axis=0) + assert results_all[k].shape[0] == length + return results_all + + +def dist_forward_collect(func, data_loader, rank, length, ret_rank=-1): + '''Forward and collect network outputs in a distributed manner. + + This function performs forward propagation and collects outputs. + It can be used to collect results, features, losses, etc. + + Args: + func (function): The function to process data. The output must be + a dictionary of CPU tensors. + rank (int): This process id. + length (int): Expected length of output arrays. + ret_rank (int): The process that returns. + Other processes will return None. + + Returns: + results_all (dict(np.ndarray)): The concatenated outputs. 
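+
+    Example (illustrative sketch):
+        >>> # each rank forwards its own shard; with the default ret_rank=-1
+        >>> # every rank receives the stitched, length-`length` arrays
+        >>> results = dist_forward_collect(func, data_loader, rank, 10000)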
+ ''' + results = [] + if rank == 0: + prog_bar = mmcv.ProgressBar(len(data_loader)) + for idx, data in enumerate(data_loader): + with torch.no_grad(): + result = func(**data) # dict{key: tensor} + results.append(result) + + if rank == 0: + prog_bar.update() + + results_all = {} + for k in results[0].keys(): + results_cat = np.concatenate([batch[k].numpy() for batch in results], + axis=0) + if ret_rank == -1: + results_gathered = gather_tensors_batch(results_cat, part_size=20) + results_strip = np.concatenate(results_gathered, axis=0)[:length] + else: + results_gathered = gather_tensors_batch( + results_cat, part_size=20, ret_rank=ret_rank) + if rank == ret_rank: + results_strip = np.concatenate( + results_gathered, axis=0)[:length] + else: + results_strip = None + results_all[k] = results_strip + return results_all diff --git a/openselfsup/utils/collect_env.py b/openselfsup/utils/collect_env.py new file mode 100644 index 00000000..9998ac6a --- /dev/null +++ b/openselfsup/utils/collect_env.py @@ -0,0 +1,63 @@ +import os.path as osp +import subprocess +import sys +from collections import defaultdict + +import cv2 +import mmcv +import torch +import torchvision + +import openselfsup + + +def collect_env(): + env_info = {} + env_info['sys.platform'] = sys.platform + env_info['Python'] = sys.version.replace('\n', '') + + cuda_available = torch.cuda.is_available() + env_info['CUDA available'] = cuda_available + + if cuda_available: + from torch.utils.cpp_extension import CUDA_HOME + env_info['CUDA_HOME'] = CUDA_HOME + + if CUDA_HOME is not None and osp.isdir(CUDA_HOME): + try: + nvcc = osp.join(CUDA_HOME, 'bin/nvcc') + nvcc = subprocess.check_output( + '"{}" -V | tail -n1'.format(nvcc), shell=True) + nvcc = nvcc.decode('utf-8').strip() + except subprocess.SubprocessError: + nvcc = 'Not Available' + env_info['NVCC'] = nvcc + + devices = defaultdict(list) + for k in range(torch.cuda.device_count()): + devices[torch.cuda.get_device_name(k)].append(str(k)) + for name, devids in devices.items(): + env_info['GPU ' + ','.join(devids)] = name + + gcc = subprocess.check_output('gcc --version | head -n1', shell=True) + gcc = gcc.decode('utf-8').strip() + env_info['GCC'] = gcc + + env_info['PyTorch'] = torch.__version__ + env_info['PyTorch compiling details'] = torch.__config__.show() + + env_info['TorchVision'] = torchvision.__version__ + + env_info['OpenCV'] = cv2.__version__ + + env_info['MMCV'] = mmcv.__version__ + env_info['OpenSelfSup'] = openselfsup.__version__ + #from openselfsup.ops import get_compiler_version, get_compiling_cuda_version + #env_info['OpenSelfSup Compiler'] = get_compiler_version() + #env_info['OpenSelfSup CUDA Compiler'] = get_compiling_cuda_version() + return env_info + + +if __name__ == "__main__": + for name, val in collect_env().items(): + print('{}: {}'.format(name, val)) diff --git a/openselfsup/utils/config_tools.py b/openselfsup/utils/config_tools.py new file mode 100644 index 00000000..93c0b273 --- /dev/null +++ b/openselfsup/utils/config_tools.py @@ -0,0 +1,12 @@ +from mmcv import Config + +def traverse_replace(d, key, value): + if isinstance(d, (dict, Config)): + for k, v in d.items(): + if k == key: + d[k] = value + else: + traverse_replace(v, key, value) + elif isinstance(d, (list, tuple, set)): + for v in d: + traverse_replace(v, key, value) diff --git a/openselfsup/utils/contextmanagers.py b/openselfsup/utils/contextmanagers.py new file mode 100644 index 00000000..0363f014 --- /dev/null +++ b/openselfsup/utils/contextmanagers.py @@ -0,0 +1,126 @@ +# coding: utf-8 
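+# Helpers for overlapping work on CUDA streams with asyncio. A hypothetical
+# call site (names illustrative) could look like:
+#   async with completed('eval', 'forward'):
+#       out = model(img)
+# which yields control until the recorded CUDA events report completion.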
+import asyncio +import contextlib +import logging +import os +import time +from typing import List + +import torch + +logger = logging.getLogger(__name__) + +DEBUG_COMPLETED_TIME = bool(os.environ.get('DEBUG_COMPLETED_TIME', False)) + + +@contextlib.asynccontextmanager +async def completed(trace_name='', + name='', + sleep_interval=0.05, + streams: List[torch.cuda.Stream] = None): + """ + Async context manager that waits for work to complete on + given CUDA streams. + + """ + if not torch.cuda.is_available(): + yield + return + + stream_before_context_switch = torch.cuda.current_stream() + if not streams: + streams = [stream_before_context_switch] + else: + streams = [s if s else stream_before_context_switch for s in streams] + + end_events = [ + torch.cuda.Event(enable_timing=DEBUG_COMPLETED_TIME) for _ in streams + ] + + if DEBUG_COMPLETED_TIME: + start = torch.cuda.Event(enable_timing=True) + stream_before_context_switch.record_event(start) + + cpu_start = time.monotonic() + logger.debug('%s %s starting, streams: %s', trace_name, name, streams) + grad_enabled_before = torch.is_grad_enabled() + try: + yield + finally: + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_end = time.monotonic() + for i, stream in enumerate(streams): + event = end_events[i] + stream.record_event(event) + + grad_enabled_after = torch.is_grad_enabled() + + # observed change of torch.is_grad_enabled() during concurrent run of + # async_test_bboxes code + assert (grad_enabled_before == grad_enabled_after + ), 'Unexpected is_grad_enabled() value change' + + are_done = [e.query() for e in end_events] + logger.debug('%s %s completed: %s streams: %s', trace_name, name, + are_done, streams) + with torch.cuda.stream(stream_before_context_switch): + while not all(are_done): + await asyncio.sleep(sleep_interval) + are_done = [e.query() for e in end_events] + logger.debug( + '%s %s completed: %s streams: %s', + trace_name, + name, + are_done, + streams, + ) + + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_time = (cpu_end - cpu_start) * 1000 + stream_times_ms = '' + for i, stream in enumerate(streams): + elapsed_time = start.elapsed_time(end_events[i]) + stream_times_ms += ' {} {:.2f} ms'.format(stream, elapsed_time) + logger.info('%s %s %.2f ms %s', trace_name, name, cpu_time, + stream_times_ms) + + +@contextlib.asynccontextmanager +async def concurrent(streamqueue: asyncio.Queue, + trace_name='concurrent', + name='stream'): + """Run code concurrently in different streams. + + :param streamqueue: asyncio.Queue instance. + + Queue tasks define the pool of streams used for concurrent execution. 
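+
+    A minimal sketch, assuming the queue was pre-filled with streams:
+
+        streamqueue = asyncio.Queue()
+        streamqueue.put_nowait(torch.cuda.Stream())
+        async with concurrent(streamqueue):
+            output = model(batch)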
+ + """ + if not torch.cuda.is_available(): + yield + return + + initial_stream = torch.cuda.current_stream() + + with torch.cuda.stream(initial_stream): + stream = await streamqueue.get() + assert isinstance(stream, torch.cuda.Stream) + + try: + with torch.cuda.stream(stream): + logger.debug('%s %s is starting, stream: %s', trace_name, name, + stream) + yield + current = torch.cuda.current_stream() + assert current == stream + logger.debug('%s %s has finished, stream: %s', trace_name, + name, stream) + finally: + streamqueue.task_done() + streamqueue.put_nowait(stream) diff --git a/openselfsup/utils/flops_counter.py b/openselfsup/utils/flops_counter.py new file mode 100644 index 00000000..df2163fd --- /dev/null +++ b/openselfsup/utils/flops_counter.py @@ -0,0 +1,444 @@ +# Modified from flops-counter.pytorch by Vladislav Sovrasov +# original repo: https://github.com/sovrasov/flops-counter.pytorch + +# MIT License + +# Copyright (c) 2018 Vladislav Sovrasov + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +import sys + +import numpy as np +import torch +import torch.nn as nn +from torch.nn.modules.batchnorm import _BatchNorm +from torch.nn.modules.conv import _ConvNd, _ConvTransposeMixin +from torch.nn.modules.pooling import (_AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, + _AvgPoolNd, _MaxPoolNd) + + +def get_model_complexity_info(model, + input_res, + print_per_layer_stat=True, + as_strings=True, + input_constructor=None, + ost=sys.stdout): + assert type(input_res) is tuple + assert len(input_res) >= 2 + flops_model = add_flops_counting_methods(model) + flops_model.eval().start_flops_count() + if input_constructor: + input = input_constructor(input_res) + _ = flops_model(**input) + else: + batch = torch.ones(()).new_empty( + (1, *input_res), + dtype=next(flops_model.parameters()).dtype, + device=next(flops_model.parameters()).device) + flops_model(batch) + + if print_per_layer_stat: + print_model_with_flops(flops_model, ost=ost) + flops_count = flops_model.compute_average_flops_cost() + params_count = get_model_parameters_number(flops_model) + flops_model.stop_flops_count() + + if as_strings: + return flops_to_string(flops_count), params_to_string(params_count) + + return flops_count, params_count + + +def flops_to_string(flops, units='GMac', precision=2): + if units is None: + if flops // 10**9 > 0: + return str(round(flops / 10.**9, precision)) + ' GMac' + elif flops // 10**6 > 0: + return str(round(flops / 10.**6, precision)) + ' MMac' + elif flops // 10**3 > 0: + return str(round(flops / 10.**3, precision)) + ' KMac' + else: + return str(flops) + ' Mac' + else: + if units == 'GMac': + return str(round(flops / 10.**9, precision)) + ' ' + units + elif units == 'MMac': + return str(round(flops / 10.**6, precision)) + ' ' + units + elif units == 'KMac': + return str(round(flops / 10.**3, precision)) + ' ' + units + else: + return str(flops) + ' Mac' + + +def params_to_string(params_num): + """converting number to string + + :param float params_num: number + :returns str: number + + >>> params_to_string(1e9) + '1000.0 M' + >>> params_to_string(2e5) + '200.0 k' + >>> params_to_string(3e-9) + '3e-09' + """ + if params_num // 10**6 > 0: + return str(round(params_num / 10**6, 2)) + ' M' + elif params_num // 10**3: + return str(round(params_num / 10**3, 2)) + ' k' + else: + return str(params_num) + + +def print_model_with_flops(model, units='GMac', precision=3, ost=sys.stdout): + total_flops = model.compute_average_flops_cost() + + def accumulate_flops(self): + if is_supported_instance(self): + return self.__flops__ / model.__batch_counter__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_flops() + return sum + + def flops_repr(self): + accumulated_flops_cost = self.accumulate_flops() + return ', '.join([ + flops_to_string( + accumulated_flops_cost, units=units, precision=precision), + '{:.3%} MACs'.format(accumulated_flops_cost / total_flops), + self.original_extra_repr() + ]) + + def add_extra_repr(m): + m.accumulate_flops = accumulate_flops.__get__(m) + flops_extra_repr = flops_repr.__get__(m) + if m.extra_repr != flops_extra_repr: + m.original_extra_repr = m.extra_repr + m.extra_repr = flops_extra_repr + assert m.extra_repr != m.original_extra_repr + + def del_extra_repr(m): + if hasattr(m, 'original_extra_repr'): + m.extra_repr = m.original_extra_repr + del m.original_extra_repr + if hasattr(m, 'accumulate_flops'): + del m.accumulate_flops + + model.apply(add_extra_repr) + print(model, file=ost) + model.apply(del_extra_repr) + + +def get_model_parameters_number(model): + 
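+    """Count only trainable parameters, i.e. those with requires_grad=True."""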
params_num = sum(p.numel() for p in model.parameters() if p.requires_grad) + return params_num + + +def add_flops_counting_methods(net_main_module): + # adding additional methods to the existing module object, + # this is done this way so that each function has access to self object + net_main_module.start_flops_count = start_flops_count.__get__( + net_main_module) + net_main_module.stop_flops_count = stop_flops_count.__get__( + net_main_module) + net_main_module.reset_flops_count = reset_flops_count.__get__( + net_main_module) + net_main_module.compute_average_flops_cost = \ + compute_average_flops_cost.__get__(net_main_module) + + net_main_module.reset_flops_count() + + # Adding variables necessary for masked flops computation + net_main_module.apply(add_flops_mask_variable_or_reset) + + return net_main_module + + +def compute_average_flops_cost(self): + """ + A method that will be available after add_flops_counting_methods() is + called on a desired net object. + Returns current mean flops consumption per image. + """ + + batches_count = self.__batch_counter__ + flops_sum = 0 + for module in self.modules(): + if is_supported_instance(module): + flops_sum += module.__flops__ + + return flops_sum / batches_count + + +def start_flops_count(self): + """ + A method that will be available after add_flops_counting_methods() is + called on a desired net object. + Activates the computation of mean flops consumption per image. + Call it before you run the network. + """ + add_batch_counter_hook_function(self) + self.apply(add_flops_counter_hook_function) + + +def stop_flops_count(self): + """ + A method that will be available after add_flops_counting_methods() is + called on a desired net object. + Stops computing the mean flops consumption per image. + Call whenever you want to pause the computation. + """ + remove_batch_counter_hook_function(self) + self.apply(remove_flops_counter_hook_function) + + +def reset_flops_count(self): + """ + A method that will be available after add_flops_counting_methods() is + called on a desired net object. + Resets statistics computed so far. 
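+    Both the batch counter and every per-module __flops__ accumulator are
+    set back to zero.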
+ """ + add_batch_counter_variables_or_reset(self) + self.apply(add_flops_counter_variable_or_reset) + + +def add_flops_mask(module, mask): + + def add_flops_mask_func(module): + if isinstance(module, torch.nn.Conv2d): + module.__mask__ = mask + + module.apply(add_flops_mask_func) + + +def remove_flops_mask(module): + module.apply(add_flops_mask_variable_or_reset) + + +def is_supported_instance(module): + for mod in hook_mapping: + if issubclass(type(module), mod): + return True + return False + + +def empty_flops_counter_hook(module, input, output): + module.__flops__ += 0 + + +def upsample_flops_counter_hook(module, input, output): + output_size = output[0] + batch_size = output_size.shape[0] + output_elements_count = batch_size + for val in output_size.shape[1:]: + output_elements_count *= val + module.__flops__ += int(output_elements_count) + + +def relu_flops_counter_hook(module, input, output): + active_elements_count = output.numel() + module.__flops__ += int(active_elements_count) + + +def linear_flops_counter_hook(module, input, output): + input = input[0] + batch_size = input.shape[0] + module.__flops__ += int(batch_size * input.shape[1] * output.shape[1]) + + +def pool_flops_counter_hook(module, input, output): + input = input[0] + module.__flops__ += int(np.prod(input.shape)) + + +def bn_flops_counter_hook(module, input, output): + input = input[0] + + batch_flops = np.prod(input.shape) + if module.affine: + batch_flops *= 2 + module.__flops__ += int(batch_flops) + + +def gn_flops_counter_hook(module, input, output): + elems = np.prod(input[0].shape) + # there is no precise FLOPs estimation of computing mean and variance, + # and we just set it 2 * elems: half muladds for computing + # means and half for computing vars + batch_flops = 3 * elems + if module.affine: + batch_flops += elems + module.__flops__ += int(batch_flops) + + +def deconv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + input_height, input_width = input.shape[2:] + + kernel_height, kernel_width = conv_module.kernel_size + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = ( + kernel_height * kernel_width * in_channels * filters_per_channel) + + active_elements_count = batch_size * input_height * input_width + overall_conv_flops = conv_per_position_flops * active_elements_count + bias_flops = 0 + if conv_module.bias is not None: + output_height, output_width = output.shape[2:] + bias_flops = out_channels * batch_size * output_height * output_height + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def conv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + output_dims = list(output.shape[2:]) + + kernel_dims = list(conv_module.kernel_size) + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = np.prod( + kernel_dims) * in_channels * filters_per_channel + + active_elements_count = batch_size * np.prod(output_dims) + + if conv_module.__mask__ is not None: + # (b, 1, h, w) + output_height, output_width = output.shape[2:] + flops_mask = conv_module.__mask__.expand(batch_size, 1, output_height, + 
output_width) + active_elements_count = flops_mask.sum() + + overall_conv_flops = conv_per_position_flops * active_elements_count + + bias_flops = 0 + + if conv_module.bias is not None: + + bias_flops = out_channels * active_elements_count + + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +hook_mapping = { + # conv + _ConvNd: conv_flops_counter_hook, + # deconv + _ConvTransposeMixin: deconv_flops_counter_hook, + # fc + nn.Linear: linear_flops_counter_hook, + # pooling + _AvgPoolNd: pool_flops_counter_hook, + _MaxPoolNd: pool_flops_counter_hook, + _AdaptiveAvgPoolNd: pool_flops_counter_hook, + _AdaptiveMaxPoolNd: pool_flops_counter_hook, + # activation + nn.ReLU: relu_flops_counter_hook, + nn.PReLU: relu_flops_counter_hook, + nn.ELU: relu_flops_counter_hook, + nn.LeakyReLU: relu_flops_counter_hook, + nn.ReLU6: relu_flops_counter_hook, + # normalization + _BatchNorm: bn_flops_counter_hook, + nn.GroupNorm: gn_flops_counter_hook, + # upsample + nn.Upsample: upsample_flops_counter_hook, +} + + +def batch_counter_hook(module, input, output): + batch_size = 1 + if len(input) > 0: + # Can have multiple inputs, getting the first one + input = input[0] + batch_size = len(input) + else: + print('Warning! No positional inputs found for a module, ' + 'assuming batch size is 1.') + module.__batch_counter__ += batch_size + + +def add_batch_counter_variables_or_reset(module): + module.__batch_counter__ = 0 + + +def add_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + return + + handle = module.register_forward_hook(batch_counter_hook) + module.__batch_counter_handle__ = handle + + +def remove_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + module.__batch_counter_handle__.remove() + del module.__batch_counter_handle__ + + +def add_flops_counter_variable_or_reset(module): + if is_supported_instance(module): + module.__flops__ = 0 + + +def add_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + return + + for mod_type, counter_hook in hook_mapping.items(): + if issubclass(type(module), mod_type): + handle = module.register_forward_hook(counter_hook) + break + + module.__flops_handle__ = handle + + +def remove_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + module.__flops_handle__.remove() + del module.__flops_handle__ + + +# --- Masked flops counting +# Also being run in the initialization +def add_flops_mask_variable_or_reset(module): + if is_supported_instance(module): + module.__mask__ = None diff --git a/openselfsup/utils/gather.py b/openselfsup/utils/gather.py new file mode 100644 index 00000000..8109f0b9 --- /dev/null +++ b/openselfsup/utils/gather.py @@ -0,0 +1,69 @@ +import numpy as np + +import torch +import torch.distributed as dist + + +def gather_tensors(input_array): + world_size = dist.get_world_size() + ## gather shapes first + myshape = input_array.shape + mycount = input_array.size + shape_tensor = torch.Tensor(np.array(myshape)).cuda() + all_shape = [ + torch.Tensor(np.array(myshape)).cuda() for i in range(world_size) + ] + dist.all_gather(all_shape, shape_tensor) + ## compute largest shapes + all_shape = [x.cpu().numpy() for x in all_shape] + all_count = [int(x.prod()) for x in all_shape] + all_shape = [list(map(int, x)) for x in all_shape] + max_count = max(all_count) + ## padding tensors and gather them + output_tensors = [ 
+ torch.Tensor(max_count).cuda() for i in range(world_size) + ] + padded_input_array = np.zeros(max_count) + padded_input_array[:mycount] = input_array.reshape(-1) + input_tensor = torch.Tensor(padded_input_array).cuda() + dist.all_gather(output_tensors, input_tensor) + ## unpadding gathered tensors + padded_output = [x.cpu().numpy() for x in output_tensors] + output = [ + x[:all_count[i]].reshape(all_shape[i]) + for i, x in enumerate(padded_output) + ] + return output + + +def gather_tensors_batch(input_array, part_size=100, ret_rank=-1): + # batch-wize gathering to avoid CUDA out of memory + rank = dist.get_rank() + all_features = [] + part_num = input_array.shape[0] // part_size + 1 if input_array.shape[ + 0] % part_size != 0 else input_array.shape[0] // part_size + for i in range(part_num): + part_feat = input_array[i * + part_size:min((i + 1) * + part_size, input_array.shape[0]), + ...] + assert part_feat.shape[ + 0] > 0, "rank: {}, length of part features should > 0".format(rank) + #print("rank: {}, gather part: {}/{}, length: {}".format(rank, i, part_num, len(part_feat))) + gather_part_feat = gather_tensors(part_feat) + all_features.append(gather_part_feat) + if ret_rank == -1: + all_features = [ + np.concatenate([all_features[i][j] for i in range(part_num)], + axis=0) for j in range(len(all_features[0])) + ] + return all_features + else: + if rank == ret_rank: + all_features = [ + np.concatenate([all_features[i][j] for i in range(part_num)], + axis=0) for j in range(len(all_features[0])) + ] + return all_features + else: + return None diff --git a/openselfsup/utils/logger.py b/openselfsup/utils/logger.py new file mode 100644 index 00000000..73f9891c --- /dev/null +++ b/openselfsup/utils/logger.py @@ -0,0 +1,66 @@ +import logging + +from mmcv.runner import get_dist_info + + +def get_root_logger(log_file=None, log_level=logging.INFO): + """Get the root logger. + + The logger will be initialized if it has not been initialized. By default a + StreamHandler will be added. If `log_file` is specified, a FileHandler will + also be added. The name of the root logger is the top-level package name, + e.g., "openselfsup". + + Args: + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the root logger. + log_level (int): The root logger level. Note that only the process of + rank 0 is affected, while other processes will set the level to + "Error" and be silent most of the time. + + Returns: + logging.Logger: The root logger. + """ + logger = logging.getLogger(__name__.split('.')[0]) # i.e., openselfsup + # if the logger has been initialized, just return it + if logger.hasHandlers(): + return logger + + format_str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + logging.basicConfig(format=format_str, level=log_level) + rank, _ = get_dist_info() + if rank != 0: + logger.setLevel('ERROR') + elif log_file is not None: + file_handler = logging.FileHandler(log_file, 'w') + file_handler.setFormatter(logging.Formatter(format_str)) + file_handler.setLevel(log_level) + logger.addHandler(file_handler) + + return logger + + +def print_log(msg, logger=None, level=logging.INFO): + """Print a log message. + + Args: + msg (str): The message to be logged. + logger (logging.Logger | str | None): The logger to be used. Some + special loggers are: + - "root": the root logger obtained with `get_root_logger()`. + - "silent": no message will be printed. + - None: The `print()` method will be used to print log messages. + level (int): Logging level. 
+            Only available when `logger` is a Logger object or "root".
+    """
+    if logger is None:
+        print(msg)
+    elif logger == 'root':
+        _logger = get_root_logger()
+        _logger.log(level, msg)
+    elif isinstance(logger, logging.Logger):
+        logger.log(level, msg)
+    elif logger != 'silent':
+        raise TypeError(
+            'logger should be either a logging.Logger object, "root", '
+            '"silent" or None, but got {}'.format(logger))
diff --git a/openselfsup/utils/misc.py b/openselfsup/utils/misc.py
new file mode 100644
index 00000000..262f168e
--- /dev/null
+++ b/openselfsup/utils/misc.py
+from functools import partial
+
+import mmcv
+import numpy as np
+from six.moves import map, zip
+
+
+def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True):
+    """Convert a batched image tensor into a list of denormalized uint8
+    images in HWC layout."""
+    num_imgs = tensor.size(0)
+    mean = np.array(mean, dtype=np.float32)
+    std = np.array(std, dtype=np.float32)
+    imgs = []
+    for img_id in range(num_imgs):
+        img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)
+        img = mmcv.imdenormalize(
+            img, mean, std, to_bgr=to_rgb).astype(np.uint8)
+        imgs.append(np.ascontiguousarray(img))
+    return imgs
+
+
+def multi_apply(func, *args, **kwargs):
+    """Apply `func` to each group of arguments and transpose the results
+    into a tuple of lists."""
+    pfunc = partial(func, **kwargs) if kwargs else func
+    map_results = map(pfunc, *args)
+    return tuple(map(list, zip(*map_results)))
+
+
+def unmap(data, count, inds, fill=0):
+    """Unmap a subset of items (data) back to the original set of items
+    (of size count)."""
+    if data.dim() == 1:
+        ret = data.new_full((count, ), fill)
+        ret[inds] = data
+    else:
+        new_size = (count, ) + data.size()[1:]
+        ret = data.new_full(new_size, fill)
+        ret[inds, :] = data
+    return ret
diff --git a/openselfsup/utils/optimizers.py b/openselfsup/utils/optimizers.py
new file mode 100644
index 00000000..8e756ea5
--- /dev/null
+++ b/openselfsup/utils/optimizers.py
+"""Layer-wise adaptive rate scaling for SGD in PyTorch."""
+import torch
+from torch.optim.optimizer import Optimizer, required
+# import the standard optimizers as well so that they remain accessible
+# from this module
+from torch.optim import *
+
+
+class LARS(Optimizer):
+    r"""Implements layer-wise adaptive rate scaling for SGD.
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts
+            defining parameter groups
+        lr (float): base learning rate (\gamma_0)
+        momentum (float, optional): momentum factor ("m", default: 0.9)
+        weight_decay (float, optional): weight decay (L2 penalty)
+            ("\beta", default: 0.0005)
+        eta (float, optional): LARS coefficient (default: 0.001)
+
+    Based on Algorithm 1 of the following paper by You, Gitman, and Ginsburg.
+    Large Batch Training of Convolutional Networks:
+    https://arxiv.org/abs/1708.03888
+
+    Example:
+        >>> optimizer = LARS(model.parameters(), lr=0.1, eta=1e-3)
+        >>> optimizer.zero_grad()
+        >>> loss_fn(model(input), target).backward()
+        >>> optimizer.step()
+    """
+
+    def __init__(self,
+                 params,
+                 lr=required,
+                 momentum=.9,
+                 weight_decay=.0005,
+                 eta=0.001):
+        if lr is not required and lr < 0.0:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if momentum < 0.0:
+            raise ValueError("Invalid momentum value: {}".format(momentum))
+        if weight_decay < 0.0:
+            raise ValueError(
+                "Invalid weight_decay value: {}".format(weight_decay))
+        if eta < 0.0:
+            raise ValueError("Invalid LARS coefficient value: {}".format(eta))
+
+        defaults = dict(
+            lr=lr, momentum=momentum, weight_decay=weight_decay, eta=eta)
+        super(LARS, self).__init__(params, defaults)
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
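+
+        For each parameter ``w`` with gradient ``g``, the update below uses
+        the layer-wise rate ``local_lr = eta * ||w|| / (||g|| +
+        weight_decay * ||w||)``, scaled by the base ``lr`` (Algorithm 1 of
+        the paper).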
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the
+                model and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            weight_decay = group['weight_decay']
+            momentum = group['momentum']
+            eta = group['eta']
+            lr = group['lr']
+
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+
+                param_state = self.state[p]
+                d_p = p.grad.data
+
+                weight_norm = torch.norm(p.data)
+                grad_norm = torch.norm(d_p)
+
+                # compute the local learning rate for this layer
+                local_lr = eta * weight_norm / \
+                    (grad_norm + weight_decay * weight_norm)
+
+                # scale by the global learning rate
+                actual_lr = local_lr * lr
+
+                # buf = momentum * buf + actual_lr * (grad + weight_decay * w)
+                if 'momentum_buffer' not in param_state:
+                    buf = param_state['momentum_buffer'] = \
+                        torch.zeros_like(p.data)
+                else:
+                    buf = param_state['momentum_buffer']
+                buf.mul_(momentum).add_(actual_lr, d_p + weight_decay * p.data)
+                p.data.add_(-buf)
+
+        return loss
diff --git a/openselfsup/utils/profiling.py b/openselfsup/utils/profiling.py
new file mode 100644
index 00000000..58b1c87d
--- /dev/null
+++ b/openselfsup/utils/profiling.py
+import contextlib
+import sys
+import time
+
+import torch
+
+if sys.version_info >= (3, 7):
+
+    @contextlib.contextmanager
+    def profile_time(trace_name,
+                     name,
+                     enabled=True,
+                     stream=None,
+                     end_stream=None):
+        """Print time spent by CPU and GPU.
+
+        Useful as a temporary context manager to find sweet spots of
+        code suitable for async implementation.
+        """
+        if (not enabled) or not torch.cuda.is_available():
+            yield
+            return
+        stream = stream if stream else torch.cuda.current_stream()
+        end_stream = end_stream if end_stream else stream
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        stream.record_event(start)
+        try:
+            cpu_start = time.monotonic()
+            yield
+        finally:
+            cpu_end = time.monotonic()
+            end_stream.record_event(end)
+            end.synchronize()
+            cpu_time = (cpu_end - cpu_start) * 1000
+            gpu_time = start.elapsed_time(end)
+            msg = "{} {} cpu_time {:.2f} ms ".format(trace_name, name,
+                                                     cpu_time)
+            msg += "gpu_time {:.2f} ms stream {}".format(gpu_time, stream)
+            print(msg)
diff --git a/openselfsup/utils/registry.py b/openselfsup/utils/registry.py
new file mode 100644
index 00000000..4ad9f876
--- /dev/null
+++ b/openselfsup/utils/registry.py
+import inspect
+from functools import partial
+
+import mmcv
+
+
+class Registry(object):
+    """A registry that maps class names to classes."""
+
+    def __init__(self, name):
+        self._name = name
+        self._module_dict = dict()
+
+    def __repr__(self):
+        format_str = self.__class__.__name__ + '(name={}, items={})'.format(
+            self._name, list(self._module_dict.keys()))
+        return format_str
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def module_dict(self):
+        return self._module_dict
+
+    def get(self, key):
+        return self._module_dict.get(key, None)
+
+    def _register_module(self, module_class, force=False):
+        """Register a module class.
+
+        Args:
+            module_class (type): Class to be registered.
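+            force (bool, optional): Whether to overwrite an existing class
+                with the same name. Default: False.
+
+        Example:
+            A minimal sketch of the registration flow via the public
+            decorator below (names are illustrative only):
+
+            >>> MODELS = Registry('model')
+            >>> @MODELS.register_module
+            ... class MyModel(object):
+            ...     pass
+            >>> assert MODELS.get('MyModel') is MyModel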
+ """ + if not inspect.isclass(module_class): + raise TypeError('module must be a class, but got {}'.format( + type(module_class))) + module_name = module_class.__name__ + if not force and module_name in self._module_dict: + raise KeyError('{} is already registered in {}'.format( + module_name, self.name)) + self._module_dict[module_name] = module_class + + def register_module(self, cls=None, force=False): + if cls is None: + return partial(self.register_module, force=force) + self._register_module(cls, force=force) + return cls + + +def build_from_cfg(cfg, registry, default_args=None): + """Build a module from config dict. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + registry (:obj:`Registry`): The registry to search the type from. + default_args (dict, optional): Default initialization arguments. + + Returns: + obj: The constructed object. + """ + assert isinstance(cfg, dict) and 'type' in cfg + assert isinstance(default_args, dict) or default_args is None + args = cfg.copy() + obj_type = args.pop('type') + if mmcv.is_str(obj_type): + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError('{} is not in the {} registry'.format( + obj_type, registry.name)) + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError('type must be a str or valid type, but got {}'.format( + type(obj_type))) + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + return obj_cls(**args) diff --git a/openselfsup/version.py b/openselfsup/version.py new file mode 100644 index 00000000..5ddd8800 --- /dev/null +++ b/openselfsup/version.py @@ -0,0 +1,5 @@ +# GENERATED VERSION FILE +# TIME: Tue Jun 16 00:02:37 2020 + +__version__ = '0.1.0+HEAD' +short_version = '0.1.0' diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..ec4ca05e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +-r requirements/runtime.txt +-r requirements/tests.txt diff --git a/requirements/runtime.txt b/requirements/runtime.txt new file mode 100644 index 00000000..54028248 --- /dev/null +++ b/requirements/runtime.txt @@ -0,0 +1,12 @@ +matplotlib +mmcv>=0.3.1 +numpy +# need older pillow until torchvision is fixed +Pillow<=6.2.2 +six +terminaltables +sklearn +faiss-gpu==1.6.1 +tensorboard +future +tqdm diff --git a/requirements/tests.txt b/requirements/tests.txt new file mode 100644 index 00000000..d45e5409 --- /dev/null +++ b/requirements/tests.txt @@ -0,0 +1,11 @@ +asynctest +codecov +flake8 +isort +pytest +pytest-cov +pytest-runner +xdoctest >= 0.10.0 +yapf +# Note: used for kwarray.group_items, this may be ported to mmcv in the future. 
+kwarray diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..ea3d3a53 --- /dev/null +++ b/setup.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +import os +import subprocess +import time +from setuptools import find_packages, setup + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + +MAJOR = 0 +MINOR = 1 +PATCH = 0 +SUFFIX = '' +if PATCH != '': + SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX) +else: + SHORT_VERSION = '{}.{}{}'.format(MAJOR, MINOR, SUFFIX) + +version_file = 'openselfsup/version.py' + + +def get_git_hash(): + + def _minimal_ext_cmd(cmd): + # construct minimal environment + env = {} + for k in ['SYSTEMROOT', 'PATH', 'HOME']: + v = os.environ.get(k) + if v is not None: + env[k] = v + # LANGUAGE is used on win32 + env['LANGUAGE'] = 'C' + env['LANG'] = 'C' + env['LC_ALL'] = 'C' + out = subprocess.Popen( + cmd, stdout=subprocess.PIPE, env=env).communicate()[0] + return out + + try: + out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) + sha = out.strip().decode('ascii') + except OSError: + sha = 'unknown' + + return sha + + +def get_hash(): + if os.path.exists('.git'): + sha = get_git_hash()[:7] + elif os.path.exists(version_file): + try: + from openselfsup.version import __version__ + sha = __version__.split('+')[-1] + except ImportError: + raise ImportError('Unable to get git version') + else: + sha = 'unknown' + + return sha + + +def write_version_py(): + content = """# GENERATED VERSION FILE +# TIME: {} + +__version__ = '{}' +short_version = '{}' +""" + sha = get_hash() + VERSION = SHORT_VERSION + '+' + sha + + with open(version_file, 'w') as f: + f.write(content.format(time.asctime(), VERSION, SHORT_VERSION)) + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def parse_requirements(fname='requirements.txt', with_version=True): + """ + Parse the package dependencies listed in a requirements file but strips + specific versioning information. 
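+    Note that version specifiers are stripped only when ``with_version`` is
+    False; ``-r`` includes are followed recursively and ``-e`` editable
+    lines are reduced to their ``#egg=`` package name.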
+ + Args: + fname (str): path to requirements file + with_version (bool, default=False): if True include version specs + + Returns: + List[str]: list of requirements items + + CommandLine: + python -c "import setup; print(setup.parse_requirements())" + """ + import sys + from os.path import exists + import re + require_fpath = fname + + def parse_line(line): + """ + Parse information from a line in a requirements text file + """ + if line.startswith('-r '): + # Allow specifying requirements in other files + target = line.split(' ')[1] + for info in parse_require_file(target): + yield info + else: + info = {'line': line} + if line.startswith('-e '): + info['package'] = line.split('#egg=')[1] + else: + # Remove versioning from the package + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, line, maxsplit=1) + parts = [p.strip() for p in parts] + + info['package'] = parts[0] + if len(parts) > 1: + op, rest = parts[1:] + if ';' in rest: + # Handle platform specific dependencies + # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies + version, platform_deps = map(str.strip, + rest.split(';')) + info['platform_deps'] = platform_deps + else: + version = rest # NOQA + info['version'] = (op, version) + yield info + + def parse_require_file(fpath): + with open(fpath, 'r') as f: + for line in f.readlines(): + line = line.strip() + if line and not line.startswith('#'): + for info in parse_line(line): + yield info + + def gen_packages_items(): + if exists(require_fpath): + for info in parse_require_file(require_fpath): + parts = [info['package']] + if with_version and 'version' in info: + parts.extend(info['version']) + if not sys.version.startswith('3.4'): + # apparently package_deps are broken in 3.4 + platform_deps = info.get('platform_deps') + if platform_deps is not None: + parts.append(';' + platform_deps) + item = ''.join(parts) + yield item + + packages = list(gen_packages_items()) + return packages + + +if __name__ == '__main__': + write_version_py() + setup( + name='openselfsup', + version=get_version(), + description='Self-Supervision Toolbox and Benchmark', + long_description=readme(), + author='Xiaohang Zhan', + author_email='xiaohangzhan@outlook.com', + keywords='unsupervised learning, self-supervised learning', + url='https://github.com/open-mmlab/openselfsup', + packages=find_packages(exclude=('configs', 'tools', 'demo')), + classifiers=[ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + ], + license='Apache License 2.0', + setup_requires=parse_requirements('requirements/build.txt'), + tests_require=parse_requirements('requirements/tests.txt'), + install_requires=parse_requirements('requirements/runtime.txt'), + zip_safe=False) diff --git a/tools/count_parameters.py b/tools/count_parameters.py new file mode 100644 index 00000000..5681a82d --- /dev/null +++ b/tools/count_parameters.py @@ -0,0 +1,38 @@ +import argparse +from mmcv import Config + +from openselfsup.models import build_model + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a model') + parser.add_argument('config', help='train config file path') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + + model = 
build_model(cfg.model) + + num_params = sum(p.numel() for p in model.parameters()) / 1024. / 1024. + num_grad_params = sum(p.numel() for p in model.parameters() \ + if p.requires_grad) / 1024. / 1024. + num_backbone_params = sum( + p.numel() for p in model.backbone.parameters()) / 1024. / 1024. + num_backbone_grad_params = sum(p.numel() for p in model.backbone.parameters() \ + if p.requires_grad) / 1024. / 1024. + print( + "Number of backbone parameters: {:.5g} M".format(num_backbone_params)) + print("Number of backbone parameters requiring grad: {:.5g} M".format( + num_backbone_grad_params)) + print("Number of total parameters: {:.5g} M".format(num_params)) + print("Number of total parameters requiring grad: {:.5g} M".format( + num_grad_params)) + + +if __name__ == '__main__': + main() diff --git a/tools/dist_extract.sh b/tools/dist_extract.sh new file mode 100755 index 00000000..0a30540c --- /dev/null +++ b/tools/dist_extract.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +PYTHON=${PYTHON:-"python"} +CFG=$1 +CHECKPOINT=$2 +GPUS=${3:-8} +PORT=${PORT:-29500} + +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ +if [ "$CHECKPOINT" == "" ]; then + $PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + tools/extract.py $CFG --layer-ind "0,1,2,3,4" --work_dir $WORK_DIR --launcher pytorch +else + $PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + tools/extract.py $CFG --layer-ind "0,1,2,3,4" --checkpoint $CHECKPOINT \ + --work_dir $WORK_DIR --launcher pytorch +fi diff --git a/tools/dist_train.sh b/tools/dist_train.sh new file mode 100755 index 00000000..a54b7dda --- /dev/null +++ b/tools/dist_train.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +PYTHON=${PYTHON:-"python"} + +CFG=$1 +GPUS=$2 +PORT=${PORT:-29500} +PY_ARGS=${@:3} + +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ + +$PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + tools/train.py $CFG --work_dir $WORK_DIR --seed 0 --launcher pytorch ${PY_ARGS} diff --git a/tools/extract.py b/tools/extract.py new file mode 100644 index 00000000..b6881046 --- /dev/null +++ b/tools/extract.py @@ -0,0 +1,160 @@ +import argparse +import importlib +import numpy as np +import os +import os.path as osp +import time + +import mmcv +import torch +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import get_dist_info, init_dist, load_checkpoint + +from openselfsup.utils import dist_forward_collect, nondist_forward_collect +from openselfsup.datasets import build_dataloader, build_dataset +from openselfsup.models import build_model +from openselfsup.models.utils import MultiPooling +from openselfsup.utils import get_root_logger + + +class ExtractProcess(object): + + def __init__(self, + pool_type='specified', + backbone='resnet50', + layer_indices=(0, 1, 2, 3, 4)): + self.multi_pooling = MultiPooling( + pool_type, in_indices=layer_indices, backbone=backbone) + + def _forward_func(self, model, **x): + backbone_feats = model(mode='extract', **x) + pooling_feats = self.multi_pooling(backbone_feats) + flat_feats = [xx.view(xx.size(0), -1) for xx in pooling_feats] + feat_dict = {'feat{}'.format(i + 1): feat.cpu() \ + for i, feat in enumerate(flat_feats)} + return feat_dict + + def extract(self, model, data_loader, distributed=False): + model.eval() + func = lambda **x: self._forward_func(model, **x) + if distributed: + rank, world_size = get_dist_info() + results = dist_forward_collect(func, data_loader, rank, + 
len(data_loader.dataset)) + else: + results = nondist_forward_collect(func, data_loader, + len(data_loader.dataset)) + return results + + +def parse_args(): + parser = argparse.ArgumentParser( + description='OpenSelfSup extract features of a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('--checkpoint', default=None, help='checkpoint file') + parser.add_argument( + '--dataset-config', + default='benchmarks/extract_info/voc07.py', + help='extract dataset config file path') + parser.add_argument( + '--layer-ind', + type=str, + help='layer indices, separated by comma, e.g., "0,1,2,3,4"') + parser.add_argument( + '--work_dir', + type=str, + default=None, + help='the dir to save logs and models') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def main(): + args = parse_args() + cfg = mmcv.Config.fromfile(args.config) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + # update configs according to CLI args + if args.work_dir is not None: + cfg.work_dir = args.work_dir + layer_ind = [int(idx) for idx in args.layer_ind.split(',')] + cfg.model.backbone.out_indices = layer_ind + + if args.checkpoint is None: + assert cfg.model.pretrained is not None, \ + "Must have pretrain if no checkpoint is given." + + # check memcached package exists + if importlib.util.find_spec('mc') is None: + for field in ['train', 'val', 'test']: + if hasattr(cfg.data, field): + getattr(cfg.data, field).data_source.memcached = False + + # init distributed env first, since logger depends on the dist info. 
+ if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # logger + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, 'extract_{}.log'.format(timestamp)) + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + + # build the dataloader + dataset_cfg = mmcv.Config.fromfile(args.dataset_config) + dataset = build_dataset(dataset_cfg.data.extract) + data_loader = build_dataloader( + dataset, + imgs_per_gpu=dataset_cfg.data.imgs_per_gpu, + workers_per_gpu=dataset_cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + + # build the model and load checkpoint + model = build_model(cfg.model) + if args.checkpoint is not None: + load_checkpoint(model, args.checkpoint, map_location='cpu') + if not distributed: + model = MMDataParallel(model, device_ids=[0]) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + + # build extraction processor + extractor = ExtractProcess( + pool_type='specified', backbone='resnet50', layer_indices=layer_ind) + + # run + outputs = extractor.extract(model, data_loader, distributed=distributed) + rank, _ = get_dist_info() + mmcv.mkdir_or_exist("{}/features/".format(args.work_dir)) + if rank == 0: + for key, val in outputs.items(): + split_num = len(dataset_cfg.split_name) + split_at = dataset_cfg.split_at + for ss in range(split_num): + output_file = "{}/features/{}_{}.npy".format( + args.work_dir, dataset_cfg.split_name[ss], key) + if ss == 0: + np.save(output_file, val[:split_at[0]]) + elif ss == split_num - 1: + np.save(output_file, val[split_at[-1]:]) + else: + np.save(output_file, val[split_at[ss - 1]:split_at[ss]]) + + +if __name__ == '__main__': + main() diff --git a/tools/extract_backbone_weights.py b/tools/extract_backbone_weights.py new file mode 100644 index 00000000..733a47a2 --- /dev/null +++ b/tools/extract_backbone_weights.py @@ -0,0 +1,34 @@ +import torch +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser( + description='This script extracts backbone weights from a checkpoint') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--save-path', type=str, default=None, help='destination file name') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + if args.save_path is None: + args.save_path = args.checkpoint[:-4] + "_extracted.pth" + ck = torch.load(args.checkpoint, map_location=torch.device('cpu')) + output_dict = dict(state_dict=dict(), author="OpenSelfSup") + has_backbone = False + for key, value in ck['state_dict'].items(): + if key.startswith('backbone'): + output_dict['state_dict'][key[9:]] = value + has_backbone = True + #elif key.startswith('encoder_q.0'): + # output_dict['state_dict'][key[12:]] = value + if not has_backbone: + raise Exception("Cannot find a backbone module in the checkpoint.") + torch.save(output_dict, args.save_path) + + +if __name__ == '__main__': + main() diff --git a/tools/kill.sh b/tools/kill.sh new file mode 100644 index 00000000..14cac347 --- /dev/null +++ b/tools/kill.sh @@ -0,0 +1,2 @@ +#!/bin/bash +kill $(ps aux | grep "train.py" | grep -v grep | awk '{print $2}') diff --git a/tools/prepare_data/create_voc_data_files.py b/tools/prepare_data/create_voc_data_files.py new file mode 100644 index 00000000..3249cff0 --- /dev/null +++ b/tools/prepare_data/create_voc_data_files.py @@ -0,0 +1,193 @@ +# Copyright (c) 
Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +################################################################################ +""" +This script can be used to extract the VOC2007 and VOC2012 dataset files +[data, labels] from the given annotations that can be used for training. The +files can be prepared for various data splits +""" + +from __future__ import unicode_literals +from __future__ import print_function +from __future__ import division +from __future__ import absolute_import + +import argparse +import logging +import numpy as np +import os +import sys +from glob import glob + +# initiate the logger +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def validate_files(input_files): + """ + The valid files will have name: _.txt. We want to remove + all the other files from the input. + """ + output_files = [] + for item in input_files: + if len(item.split('/')[-1].split('_')) == 2: + output_files.append(item) + return output_files + + +def get_data_files(split, args): + data_dir = os.path.join(args.data_source_dir, 'ImageSets/Main') + assert os.path.exists(data_dir), "Data: {} doesn't exist".format(data_dir) + test_data_files = glob(os.path.join(data_dir, '*_test.txt')) + test_data_files = validate_files(test_data_files) + if args.separate_partitions > 0: + train_data_files = glob(os.path.join(data_dir, '*_train.txt')) + val_data_files = glob(os.path.join(data_dir, '*_val.txt')) + train_data_files = validate_files(train_data_files) + val_data_files = validate_files(val_data_files) + assert len(train_data_files) == len(val_data_files) + if split == 'train': + data_files = train_data_files + elif split == 'test': + data_files = test_data_files + else: + data_files = val_data_files + else: + train_data_files = glob(os.path.join(data_dir, '*_trainval.txt')) + if len(test_data_files) == 0: + # For VOC2012 dataset, we have trainval, val and train data. + train_data_files = glob(os.path.join(data_dir, '*_train.txt')) + test_data_files = glob(os.path.join(data_dir, '*_val.txt')) + test_data_files = validate_files(test_data_files) + train_data_files = validate_files(train_data_files) + data_files = train_data_files if (split + == 'train') else test_data_files + assert len(train_data_files) == len(test_data_files), "Missing classes" + return data_files + + +def get_images_labels_info(split, args): + assert os.path.exists(args.data_source_dir), "Data source NOT found. Abort" + + data_files = get_data_files(split, args) + # we will construct a map for image name to the vector of -1, 0, 1 + # we sort the data_files which gives sorted class names as well + img_labels_map = {} + for cls_num, data_path in enumerate(sorted(data_files)): + # for this class, we have images and each image will have label + # 1, -1, 0 -> present, not present, ignore respectively as in VOC data. 
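+        # e.g. the line "000005 -1" in aeroplane_trainval.txt means that
+        # image 000005 contains no aeroplane (remapped to 0 below)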
+        with open(data_path, 'r') as fopen:
+            for line in fopen:
+                try:
+                    img_name, orig_label = line.strip().split()
+                    if img_name not in img_labels_map:
+                        img_labels_map[img_name] = -np.ones(
+                            len(data_files), dtype=np.int32)
+                    orig_label = int(orig_label)
+                    # in VOC data, -1 (not present), set it to 0 as train target
+                    if orig_label == -1:
+                        orig_label = 0
+                    # in VOC data, 0 (ignore), set it to -1 as train target
+                    elif orig_label == 0:
+                        orig_label = -1
+                    img_labels_map[img_name][cls_num] = orig_label
+                except Exception:
+                    logger.info('Error processing: {} data_path: {}'.format(
+                        line, data_path))
+
+    img_paths, img_labels = [], []
+    for item in sorted(img_labels_map.keys()):
+        img_paths.append(
+            os.path.join(args.data_source_dir, 'JPEGImages', item + '.jpg'))
+        img_labels.append(img_labels_map[item])
+
+    output_dict = {}
+    if args.generate_json:
+        cls_names = []
+        for item in sorted(data_files):
+            name = item.split('/')[-1].split('.')[0].split('_')[0]
+            cls_names.append(name)
+
+        img_ids, json_img_labels = [], []
+        for item in sorted(img_labels_map.keys()):
+            img_ids.append(item)
+            json_img_labels.append(img_labels_map[item])
+
+        for img_idx in range(len(img_ids)):
+            img_id = img_ids[img_idx]
+            out_lbl = {}
+            for cls_idx in range(len(cls_names)):
+                name = cls_names[cls_idx]
+                out_lbl[name] = int(json_img_labels[img_idx][cls_idx])
+            output_dict[img_id] = out_lbl
+    return img_paths, img_labels, output_dict
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Create VOC data files")
+    parser.add_argument(
+        '--data_source_dir',
+        type=str,
+        default=None,
+        help="Path to data directory containing ImageSets and JPEGImages")
+    parser.add_argument(
+        '--output_dir',
+        type=str,
+        default=None,
+        help="Output directory where images/label information will be written")
+    parser.add_argument(
+        '--separate_partitions',
+        type=int,
+        default=0,
+        help="Whether to create files separately for partitions train/test/val"
+    )
+    parser.add_argument(
+        '--generate_json',
+        type=int,
+        default=0,
+        help="Whether to write json files for partitions train/test/val")
+    args = parser.parse_args()
+
+    # given the data directory for the partitions train, val, and test, we
+    # will write numpy files for each partition.
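+    # <partition>_images.npy stores the image paths and
+    # <partition>_labels.npy the corresponding label matrix.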
+ partitions = ['train', 'test'] + if args.separate_partitions > 0: + partitions.append('val') + + for partition in partitions: + logger.info( + '========Preparing {} data files========'.format(partition)) + imgs_info, lbls_info, output_dict = get_images_labels_info( + partition, args) + img_info_out_path = os.path.join(args.output_dir, + partition + '_images.npy') + label_info_out_path = os.path.join(args.output_dir, + partition + '_labels.npy') + logger.info( + '=================SAVING DATA files=======================') + logger.info('partition: {} saving img_paths to: {}'.format( + partition, img_info_out_path)) + logger.info('partition: {} saving lbls_paths: {}'.format( + partition, label_info_out_path)) + logger.info('partition: {} imgs: {}'.format(partition, + np.array(imgs_info).shape)) + np.save(img_info_out_path, np.array(imgs_info)) + np.save(label_info_out_path, np.array(lbls_info)) + if args.generate_json: + json_out_path = os.path.join(args.output_dir, + partition + '_targets.json') + import json + with open(json_out_path, 'w') as fp: + json.dump(output_dict, fp) + logger.info('Saved Json to: {}'.format(json_out_path)) + logger.info('DONE!') + + +if __name__ == '__main__': + main() diff --git a/tools/prepare_data/create_voc_low_shot_challenge_samples.py b/tools/prepare_data/create_voc_low_shot_challenge_samples.py new file mode 100644 index 00000000..34da1a2b --- /dev/null +++ b/tools/prepare_data/create_voc_low_shot_challenge_samples.py @@ -0,0 +1,131 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +################################################################################ +""" +This script is used to create the low-shot data for VOC svm trainings. 
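+For each independent sample and each k in k_values, k positives and
+(num_classes - 1) * k negatives are drawn per class; every other entry is
+left as -1 (ignore).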
+""" +from __future__ import unicode_literals +from __future__ import print_function +from __future__ import division +from __future__ import absolute_import + +import argparse +import json +import logging +import numpy as np +import os +import random +import sys + +# create the logger +FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) +logger = logging.getLogger(__name__) + + +def load_json(file_path, ground_truth=True): + import json + assert os.path.exists(file_path), "{} does not exist".format(file_path) + with open(file_path, 'r') as fp: + data = json.load(fp) + img_ids = sorted(list(data.keys())) + cls_names = sorted(list(data[img_ids[0]].keys())) + if ground_truth: + output = np.empty((len(img_ids), len(cls_names)), dtype=np.int32) + else: + output = np.empty((len(img_ids), len(cls_names)), dtype=np.float64) + for idx in range(len(img_ids)): + for cls_idx in range(len(cls_names)): + output[idx][cls_idx] = data[img_ids[idx]][cls_names[cls_idx]] + return output, img_ids, cls_names + + +def save_json(input_data, img_ids, cls_names, output_file): + output_dict = {} + for img_idx in range(len(img_ids)): + img_id = img_ids[img_idx] + out_lbl = {} + for cls_idx in range(len(cls_names)): + name = cls_names[cls_idx] + out_lbl[name] = int(input_data[img_idx][cls_idx]) + output_dict[img_id] = out_lbl + logger.info('Saving file: {}'.format(output_file)) + with open(output_file, 'w') as fp: + json.dump(output_dict, fp) + + +def sample_symbol(input_targets, output_target, symbol, num): + logger.info('Sampling symbol: {} for num: {}'.format(symbol, num)) + num_classes = input_targets.shape[1] + for idx in range(num_classes): + symbol_data = np.where(input_targets[:, idx] == symbol)[0] + sampled = random.sample(list(symbol_data), num) + for index in sampled: + output_target[index, idx] = symbol + return output_target + + +def generate_independent_sample(opts, targets, img_ids, cls_names): + k_values = [int(val) for val in opts.k_values.split(",")] + # the way sample works is: for each independent sample, and a given k value + # we create a matrix of the same shape as given targets file. We initialize + # this matrix with -1 (ignore label). We then sample k positive and + # (num_classes-1) * k negatives. 
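+    # entries that are not sampled stay -1 and are ignored during training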
+ # N x 20 shape + num_classes = targets.shape[1] + for idx in range(opts.num_samples): + for k in k_values: + logger.info('Sampling: {} time for k-value: {}'.format(idx + 1, k)) + output = np.ones(targets.shape, dtype=np.int32) * -1 + output = sample_symbol(targets, output, 1, k) + output = sample_symbol(targets, output, 0, (num_classes - 1) * k) + prefix = opts.targets_data_file.split('/')[-1].split('.')[0] + output_file = os.path.join( + opts.output_path, + '{}_sample{}_k{}.json'.format(prefix, idx + 1, k)) + save_json(output, img_ids, cls_names, output_file) + npy_output_file = os.path.join( + opts.output_path, + '{}_sample{}_k{}.npy'.format(prefix, idx + 1, k)) + logger.info('Saving npy file: {}'.format(npy_output_file)) + np.save(npy_output_file, output) + logger.info('Done!!') + + +def main(): + parser = argparse.ArgumentParser( + description='Sample Low shot data for VOC') + parser.add_argument( + '--targets_data_file', + type=str, + default=None, + help="Json file containing image labels") + parser.add_argument( + '--output_path', + type=str, + default=None, + help="path where low-shot samples should be saved") + parser.add_argument( + '--k_values', + type=str, + default="1,2,4,8,16,32,64,96", + help="Low-shot k-values for svm testing.") + parser.add_argument( + '--num_samples', + type=int, + default=5, + help="Number of independent samples.") + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + opts = parser.parse_args() + targets, img_ids, cls_names = load_json(opts.targets_data_file) + generate_independent_sample(opts, targets, img_ids, cls_names) + + +if __name__ == '__main__': + main() diff --git a/tools/prepare_data/prepare_voc07_cls.sh b/tools/prepare_data/prepare_voc07_cls.sh new file mode 100644 index 00000000..799239d2 --- /dev/null +++ b/tools/prepare_data/prepare_voc07_cls.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +DATA="$1" +if [ "$DATA" == "" ]; then + echo "Usage: bash tools/prepare_data/prepare_voc07_cls.sh YOUR_DATA_ROOT" + exit +fi + +VOC="$DATA/VOCdevkit/VOC2007/" + +wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar -P $DATA +wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar -P $DATA +tar -xf $DATA/VOCtrainval_06-Nov-2007.tar -C $DATA +tar -xf $DATA/VOCtest_06-Nov-2007.tar -C $DATA + +mkdir -p $VOC/SVMLabels/low_shot/labels/ + +python $(dirname "$0")/create_voc_data_files.py \ + --data_source_dir $VOC \ + --output_dir $VOC/SVMLabels/ \ + --generate_json 1 + +python $(dirname "$0")/create_voc_low_shot_challenge_samples.py \ + --targets_data_file $VOC/SVMLabels/train_targets.json \ + --output_path $VOC/SVMLabels/low_shot/labels/ \ + --k_values "1,2,4,8,16,32,64,96" \ + --num_samples 5 + +mkdir $VOC/Lists + +awk 'NF{print $0 ".jpg"}' $VOC/ImageSets/Main/trainval.txt $VOC/ImageSets/Main/test.txt > $VOC/Lists/trainvaltest.txt + +mkdir data/ +ln -s $DATA/VOCdevkit data/ diff --git a/tools/publish_model.py b/tools/publish_model.py new file mode 100644 index 00000000..4dd35332 --- /dev/null +++ b/tools/publish_model.py @@ -0,0 +1,34 @@ +import argparse +import subprocess + +import torch + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Process a checkpoint to be published') + parser.add_argument('in_file', help='input checkpoint filename') + args = parser.parse_args() + return args + + +def process_checkpoint(in_file, out_file): + checkpoint = torch.load(in_file, map_location='cpu') + # remove optimizer for smaller file size + if 'optimizer' in checkpoint: + del checkpoint['optimizer'] + 
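+    # for SGD-style optimizers the momentum buffers are roughly the size of
+    # the weights, so dropping them shrinks the file to about the bare model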
# if it is necessary to remove some sensitive data in checkpoint['meta'], + # add the code here. + torch.save(checkpoint, in_file + ".tmp.pth") + sha = subprocess.check_output(['sha256sum', out_file]).decode() + final_file = out_file.rstrip('.pth') + f'-{sha[:8]}.pth' + subprocess.Popen(['mv', in_file + ".tmp.pth", final_file]) + + +def main(): + args = parse_args() + process_checkpoint(args.in_file, args.in_file) + + +if __name__ == '__main__': + main() diff --git a/tools/single_train.sh b/tools/single_train.sh new file mode 100644 index 00000000..84e0c3c4 --- /dev/null +++ b/tools/single_train.sh @@ -0,0 +1,9 @@ +#!/bin/bash +PYTHON=${PYTHON:-"python"} + +CFG=$1 +PY_ARGS=${@:2} + +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ + +$PYTHON -u tools/train.py $1 --work_dir $WORK_DIR ${PY_ARGS} diff --git a/tools/srun_extract.sh b/tools/srun_extract.sh new file mode 100644 index 00000000..44e2e029 --- /dev/null +++ b/tools/srun_extract.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -x + +PARTITION=$1 +CFG=$2 +CHECKPOINT=$3 +GPUS=${4:-8} +PY_ARGS=${@:5} +JOB_NAME="openselfsup" +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +SRUN_ARGS=${SRUN_ARGS:-""} + +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ + +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/extract.py $CFG \ + --layer-ind "0,1,2,3,4" --checkpoint $CHECKPOINT \ + --work_dir $WORK_DIR --launcher="slurm" ${PY_ARGS} diff --git a/tools/srun_train.sh b/tools/srun_train.sh new file mode 100755 index 00000000..acaa3ac4 --- /dev/null +++ b/tools/srun_train.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +set -x + +PARTITION=$1 +CFG=$2 +GPUS=${3:-8} +PY_ARGS=${@:4} +JOB_NAME="openselfsup" +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +SRUN_ARGS=${SRUN_ARGS:-""} + +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ + +GLOG_vmodule=MemcachedClient=-1 \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/train.py ${CFG} \ + --work_dir ${WORK_DIR} --seed 0 --launcher="slurm" ${PY_ARGS} diff --git a/tools/test.py b/tools/test.py new file mode 100644 index 00000000..22cef51b --- /dev/null +++ b/tools/test.py @@ -0,0 +1,123 @@ +import argparse +import importlib +import os +import os.path as osp +import time + +import mmcv +import torch +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import get_dist_info, init_dist, load_checkpoint + +from openselfsup.datasets import build_dataloader, build_dataset +from openselfsup.models import build_model +from openselfsup.utils import get_root_logger, dist_forward_collect, nondist_forward_collect + + +def single_gpu_test(model, data_loader): + model.eval() + func = lambda **x: model(mode='test', **x) + results = nondist_forward_collect(func, data_loader, + len(data_loader.dataset)) + return results + + +def multi_gpu_test(model, data_loader): + model.eval() + func = lambda **x: model(mode='test', **x) + rank, world_size = get_dist_info() + results = dist_forward_collect(func, data_loader, rank, + len(data_loader.dataset)) + return results + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet test 
(and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--work_dir', + type=str, + default=None, + help='the dir to save logs and models') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--port', type=int, default=29500, + help='port only works when launcher=="slurm"') + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + + cfg = mmcv.Config.fromfile(args.config) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + # update configs according to CLI args + if args.work_dir is not None: + cfg.work_dir = args.work_dir + + cfg.model.pretrained = None # ensure to use checkpoint rather than pretraining + + # check memcached package exists + if importlib.util.find_spec('mc') is None: + for field in ['train', 'val', 'test']: + if hasattr(cfg.data, field): + getattr(cfg.data, field).data_source.memcached = False + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + else: + distributed = True + if args.launcher == 'slurm': + cfg.dist_params['port'] = args.port + init_dist(args.launcher, **cfg.dist_params) + + # logger + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, 'test_{}.log'.format(timestamp)) + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + + # build the dataloader + dataset = build_dataset(cfg.data.val) + data_loader = build_dataloader( + dataset, + imgs_per_gpu=cfg.data.imgs_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + + # build the model and load checkpoint + model = build_model(cfg.model) + load_checkpoint(model, args.checkpoint, map_location='cpu') + + if not distributed: + model = MMDataParallel(model, device_ids=[0]) + outputs = single_gpu_test(model, data_loader) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + outputs = multi_gpu_test(model, data_loader) # dict{key: np.ndarray} + + rank, _ = get_dist_info() + if rank == 0: + for name, val in outputs.items(): + dataset.evaluate( + torch.from_numpy(val), name, logger, topk=(1, 5)) + + +if __name__ == '__main__': + main() diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 00000000..1ff05932 --- /dev/null +++ b/tools/train.py @@ -0,0 +1,142 @@ +from __future__ import division +import argparse +import importlib +import os +import os.path as osp +import time + +import mmcv +import torch +from mmcv import Config +from mmcv.runner import init_dist + +from openselfsup import __version__ +from openselfsup.apis import set_random_seed, train_model +from openselfsup.datasets import build_dataset +from openselfsup.models import build_model +from openselfsup.utils import collect_env, get_root_logger, traverse_replace + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a model') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--work_dir', + type=str, + default=None, + help='the dir to save logs and models') + parser.add_argument( + '--resume_from', 
help='the checkpoint file to resume from') + parser.add_argument( + '--pretrained', default=None, help='pretrained model file') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=None, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--port', type=int, default=29500, + help='port only works when launcher=="slurm"') + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + # update configs according to CLI args + if args.work_dir is not None: + cfg.work_dir = args.work_dir + if args.resume_from is not None: + cfg.resume_from = args.resume_from + cfg.gpus = args.gpus + + # check memcached package exists + if importlib.util.find_spec('mc') is None: + traverse_replace(cfg, 'memcached', False) + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + assert cfg.model.type not in \ + ['DeepCluster', 'MOCO', 'SimCLR', 'ODC', 'NPID'], \ + "{} does not support non-dist training.".format(cfg.model.type) + else: + distributed = True + if args.launcher == 'slurm': + cfg.dist_params['port'] = args.port + init_dist(args.launcher, **cfg.dist_params) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, 'train_{}.log'.format(timestamp)) + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([('{}: {}'.format(k, v)) + for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + + # log some basic info + logger.info('Distributed training: {}'.format(distributed)) + logger.info('Config:\n{}'.format(cfg.text)) + + # set random seeds + if args.seed is not None: + logger.info('Set random seed to {}, deterministic: {}'.format( + args.seed, args.deterministic)) + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + + if args.pretrained is not None: + assert isinstance(args.pretrained, str) + cfg.model.pretrained = args.pretrained + model = build_model(cfg.model) + + datasets = [build_dataset(cfg.data.train)] + assert len(cfg.workflow) == 1, "Validation is called by hook." 
+ if cfg.checkpoint_config is not None: + # save openselfsup version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + openselfsup_version=__version__, config=cfg.text) + # add an attribute for visualization convenience + train_model( + model, + datasets, + cfg, + distributed=distributed, + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main() diff --git a/tools/upgrade_models.py b/tools/upgrade_models.py new file mode 100644 index 00000000..de4c2c5e --- /dev/null +++ b/tools/upgrade_models.py @@ -0,0 +1,27 @@ +import torch +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--save-path', type=str, required=True, help='destination file name') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + ck = torch.load(args.checkpoint, map_location=torch.device('cpu')) + output_dict = dict(state_dict=dict(), author='OpenSelfSup') + for key, value in ck.items(): + if key.startswith('head'): + continue + else: + output_dict['state_dict'][key] = value + torch.save(output_dict, args.save_path) + + +if __name__ == '__main__': + main()
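+
+# Typical usage (paths are illustrative):
+#   python tools/upgrade_models.py old_checkpoint.pth \
+#       --save-path upgraded_checkpoint.pth
+# The output keeps every key except those starting with 'head', wrapped as
+# {'state_dict': ..., 'author': 'OpenSelfSup'}.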