upload code

2020-06-16 00:05:18 +08:00 · 2020-06-16 00:05:18 +08:00 · b742b7c18b
commit b742b7c18b
150 changed files with 10692 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,128 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+mmdet/version.py
+data
+.vscode
+.idea
+
+# custom
+*.pkl
+*.pkl.json
+*.log.json
+work_dirs/
+pretrains
+pretrains/
+
+# Pytorch
+*.pth
+
+*.swp
+source.sh
+tensorboard.sh
+.DS_Store
+replace.sh
+benchmarks/detection/datasets
+benchmarks/detection/output
--- a/.style.yapf
+++ b/.style.yapf
@ -0,0 +1,4 @@
+[style]
+BASED_ON_STYLE = pep8
+BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
+SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
--- a/README.md
+++ b/README.md
@ -0,0 +1,107 @@
+
+# OpenSelfSup
+
+## Introduction
+
+The master branch works with **PyTorch 1.1** or higher.
+
+OpenSelfSup is an open source unsupervised representation learning toolbox based on PyTorch.
+
+### What does this repo do?
+
+Below are the relations among Unsupervised Learning, Self-Supervised Learning and Representation Learning. This repo focuses on the shaded area, i.e., Unsupervised Representation Learning. Self-Supervised Representation Learning is the major branch of it. Since in many cases we do not distinguish between Self-Supervised Representation Learning and Unsupervised Representation Learning strictly, we still name this repo as `OpenSelfSup`.
+
+<img src="docs/relation.jpg" width="600"/>
+
+### Major features
+
+- **All methods in one repository**
+  
+|                                                                                                                                                       |  Support |
+|-------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:|
+| [ImageNet](https://link.springer.com/article/10.1007/s11263-015-0816-y?sa_campaign=email/event/articleAuthor/onlineFirst#)                            |     ✓    |
+| [Relative-Loc](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Doersch_Unsupervised_Visual_Representation_ICCV_2015_paper.pdf)      |     ✓    |
+| [Rotation-Pred](https://arxiv.org/abs/1803.07728)                                                                                                     |     ✓    |
+| [DeepCluster](https://arxiv.org/abs/1807.05520)                                                                                                       |     ✓    |
+| [ODC](http://openaccess.thecvf.com/content_CVPR_2020/papers/Zhan_Online_Deep_Clustering_for_Unsupervised_Representation_Learning_CVPR_2020_paper.pdf) |     ✓    |
+| [NPID](https://arxiv.org/abs/1805.01978)                                                                                                              |     ✓    |
+| [MoCo](https://arxiv.org/abs/1911.05722)                                                                                                              |     ✓    |
+| [MoCo v2](https://arxiv.org/abs/2003.04297)                                                                                                           |     ✓    |
+| [SimCLR](https://arxiv.org/abs/2002.05709)                                                                                                            |     ✓    |
+| [PIRL](http://openaccess.thecvf.com/content_CVPR_2020/papers/Misra_Self-Supervised_Learning_of_Pretext-Invariant_Representations_CVPR_2020_paper.pdf) | progress |
+
+- **Flexibility & Extensibility**
+
+OpenSelfSup follows a code architecture similar to that of MMDetection while being even more flexible, since it integrates various self-supervised tasks including classification, joint clustering and feature learning, contrastive learning, tasks with a memory bank, etc.
+
+For existing methods in this repo, you only need to modify config files to adjust hyper-parameters. It is also simple to design your own methods, please refer to [GETTING_STARTED](docs/GETTING_STARTED.md).
+
+- **Efficiency**
+
+  All methods support multi-machine multi-gpu distributed training.
+
+- **Standardized Benchmarks**
+
+  We standardize the benchmarks including logistic regression, SVM / Low-shot SVM from linearly probed features, semi-supervised classification, and object detection. Below are the settings of these benchmarks.
+
+| Benchmarks                       | Setting                                                                                                                                                                     | Difference                                      |
+|----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|
+| ImageNet Linear Classification   | [goyal2019scaling](http://openaccess.thecvf.com/content_ICCV_2019/papers/Goyal_Scaling_and_Benchmarking_Self-Supervised_Visual_Representation_Learning_ICCV_2019_paper.pdf) | Total 90 epochs, decay at [30, 60].             |
+| Places205 Linear Classification  | [goyal2019scaling](http://openaccess.thecvf.com/content_ICCV_2019/papers/Goyal_Scaling_and_Benchmarking_Self-Supervised_Visual_Representation_Learning_ICCV_2019_paper.pdf) | Total 90 epochs, decay at [30, 60].             |
+| PASCAL VOC07 SVM                 | [goyal2019scaling](http://openaccess.thecvf.com/content_ICCV_2019/papers/Goyal_Scaling_and_Benchmarking_Self-Supervised_Visual_Representation_Learning_ICCV_2019_paper.pdf) | Costs="1.0,10.0,100.0" to save evaluation time. |
+| PASCAL VOC07 Low-shot SVM        | [goyal2019scaling](http://openaccess.thecvf.com/content_ICCV_2019/papers/Goyal_Scaling_and_Benchmarking_Self-Supervised_Visual_Representation_Learning_ICCV_2019_paper.pdf) | Costs="1.0,10.0,100.0" to save evaluation time. |
+| PASCAL VOC07+12 Object Detection | [MoCo](http://openaccess.thecvf.com/content_CVPR_2020/papers/He_Momentum_Contrast_for_Unsupervised_Visual_Representation_Learning_CVPR_2020_paper.pdf)                      |                                                 |
+| COCO17 Object Detection          | [MoCo](http://openaccess.thecvf.com/content_CVPR_2020/papers/He_Momentum_Contrast_for_Unsupervised_Visual_Representation_Learning_CVPR_2020_paper.pdf)                      |                                                 |
+
+
+## License
+
+This project is released under the [Apache 2.0 license](LICENSE).
+
+## Changelog
+
+v0.1.0 was released on 15/06/2020.
+Please refer to [CHANGELOG.md](docs/CHANGELOG.md) for details and release history.
+
+## Benchmark and model zoo
+
+## Installation
+
+Please refer to [INSTALL.md](docs/INSTALL.md) for installation and dataset preparation.
+
+
+## Get Started
+
+Please see [GETTING_STARTED.md](docs/GETTING_STARTED.md) for the basic usage of OpenSelfSup.
+
+## Contributing
+
+We appreciate all contributions to improve OpenSelfSup. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guideline.
+
+## Citation
+
+If you use this toolbox or benchmark in your research, please cite this project.
+
+```
+@article{openselfsup,
+  title   = {{OpenSelfSup}: Open MMLab Self-Supervised Learning Toolbox and Benchmark},
+  author  = {Xiaohang Zhan, Jiahao Xie, Ziwei Liu, Dahua Lin, Chen Change Loy},
+  howpublished = {\url{https://github.com/open-mmlab/openselfsup}},
+  year = {2020}
+}
+```
+
+## Acknowledgement
+
+1. This repo borrows the architecture design and part of the code from [MMDetection](https://github.com/open-mmlab/mmdetection).
+
+2. The implementation of MoCo and the detection benchmark borrow the code from [moco](https://github.com/facebookresearch/moco).
+
+3. The SVM benchmark borrows the code from [
+fair_self_supervision_benchmark](https://github.com/facebookresearch/fair_self_supervision_benchmark).
+
+4. `openselfsup/third_party/clustering.py` is borrowed from [deepcluster](https://github.com/facebookresearch/deepcluster/blob/master/clustering.py).
+
+## Contact
+
+This repo is currently maintained by Xiaohang Zhan ([@XiaohangZhan](http://github.com/XiaohangZhan)).
--- a/benchmarks/detection/README.md
+++ b/benchmarks/detection/README.md
@ -0,0 +1,12 @@
+
+## Transferring to Detection
+
+We follow the evaluation setting in MoCo when transferring to object detection.
+
+### Instruction
+
+1. Install [detectron2](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md).
+
+1. Put dataset under "benchmarks/detection/datasets" directory,
+   following the [directory structure](https://github.com/facebookresearch/detectron2/tree/master/datasets)
+   required by detectron2.
--- a/benchmarks/detection/configs/Base-RCNN-C4-BN.yaml
+++ b/benchmarks/detection/configs/Base-RCNN-C4-BN.yaml
@ -0,0 +1,17 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  RPN:
+    PRE_NMS_TOPK_TEST: 6000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "Res5ROIHeadsExtraNorm"
+  BACKBONE:
+    FREEZE_AT: 0
+  RESNETS:
+    NORM: "SyncBN"
+TEST:
+  PRECISE_BN:
+    ENABLED: True
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
--- a/benchmarks/detection/configs/coco_R_50_C4_2x.yaml
+++ b/benchmarks/detection/configs/coco_R_50_C4_2x.yaml
@ -0,0 +1,13 @@
+_BASE_: "Base-RCNN-C4-BN.yaml"
+MODEL:
+  MASK_ON: True
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+  MIN_SIZE_TEST: 800
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  STEPS: (120000, 160000)
+  MAX_ITER: 180000
--- a/benchmarks/detection/configs/coco_R_50_C4_2x_moco.yaml
+++ b/benchmarks/detection/configs/coco_R_50_C4_2x_moco.yaml
@ -0,0 +1,10 @@
+_BASE_: "coco_R_50_C4_2x.yaml"
+MODEL:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  WEIGHTS: "See Instructions"
+  RESNETS:
+    STRIDE_IN_1X1: False
+INPUT:
+  MAX_SIZE_TRAIN: 1200
+  FORMAT: "RGB"
--- a/benchmarks/detection/configs/pascal_voc_R_50_C4_24k.yaml
+++ b/benchmarks/detection/configs/pascal_voc_R_50_C4_24k.yaml
@ -0,0 +1,16 @@
+_BASE_: "Base-RCNN-C4-BN.yaml"
+MODEL:
+  MASK_ON: False
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_HEADS:
+    NUM_CLASSES: 20
+INPUT:
+  MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
+  MIN_SIZE_TEST: 800
+DATASETS:
+  TRAIN: ('voc_2007_trainval', 'voc_2012_trainval')
+  TEST: ('voc_2007_test',)
+SOLVER:
+  STEPS: (18000, 22000)
+  MAX_ITER: 24000
+  WARMUP_ITERS: 100
--- a/benchmarks/detection/configs/pascal_voc_R_50_C4_24k_moco.yaml
+++ b/benchmarks/detection/configs/pascal_voc_R_50_C4_24k_moco.yaml
@ -0,0 +1,9 @@
+_BASE_: "pascal_voc_R_50_C4_24k.yaml"
+MODEL:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  WEIGHTS: "See Instructions"
+  RESNETS:
+    STRIDE_IN_1X1: False
+INPUT:
+  FORMAT: "RGB"
--- a/benchmarks/detection/convert-pretrain-to-detectron2.py
+++ b/benchmarks/detection/convert-pretrain-to-detectron2.py
@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import pickle as pkl
+import sys
+import torch
+
+if __name__ == "__main__":
+    input = sys.argv[1]
+
+    obj = torch.load(input, map_location="cpu")
+    obj = obj["state_dict"]
+
+    newmodel = {}
+    for k, v in obj.items():
+        old_k = k
+        if "layer" not in k:
+            k = "stem." + k
+        for t in [1, 2, 3, 4]:
+            k = k.replace("layer{}".format(t), "res{}".format(t + 1))
+        for t in [1, 2, 3]:
+            k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
+        k = k.replace("downsample.0", "shortcut")
+        k = k.replace("downsample.1", "shortcut.norm")
+        print(old_k, "->", k)
+        newmodel[k] = v.numpy()
+
+    res = {
+        "model": newmodel,
+        "__author__": "OpenSelfSup",
+        "matching_heuristics": True
+    }
+
+    assert sys.argv[2].endswith('.pkl')
+    with open(sys.argv[2], "wb") as f:
+        pkl.dump(res, f)
--- a/benchmarks/detection/run.sh
+++ b/benchmarks/detection/run.sh
@ -0,0 +1,6 @@
+#!/bin/bash
+DET_CFG=$1
+WEIGHTS=$2
+
+python $(dirname "$0")/train_net.py --config-file $DET_CFG \
+    --num-gpus 8 MODEL.WEIGHTS $WEIGHTS
--- a/benchmarks/detection/train_net.py
+++ b/benchmarks/detection/train_net.py
@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import os
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
+from detectron2.evaluation import COCOEvaluator, PascalVOCDetectionEvaluator
+from detectron2.layers import get_norm
+from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads
+
+
+@ROI_HEADS_REGISTRY.register()
+class Res5ROIHeadsExtraNorm(Res5ROIHeads):
+    """
+    As described in the MOCO paper, there is an extra BN layer
+    following the res5 stage.
+    """
+
+    def _build_res5_block(self, cfg):
+        seq, out_channels = super()._build_res5_block(cfg)
+        norm = cfg.MODEL.RESNETS.NORM
+        norm = get_norm(norm, out_channels)
+        seq.add_module("norm", norm)
+        return seq, out_channels
+
+
+class Trainer(DefaultTrainer):
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        if "coco" in dataset_name:
+            return COCOEvaluator(dataset_name, cfg, True, output_folder)
+        else:
+            assert "voc" in dataset_name
+            return PascalVOCDetectionEvaluator(dataset_name)
+
+
+def setup(args):
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(
+            model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+                cfg.MODEL.WEIGHTS, resume=args.resume)
+        res = Trainer.test(cfg, model)
+        return res
+
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args, ),
+    )
--- a/benchmarks/dist_test_cls.sh
+++ b/benchmarks/dist_test_cls.sh
@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+CFG=$1
+EPOCH=$2
+DATASET=$3 # imagenet or places205
+GPUS=${GPUS:-1}
+PORT=${PORT:-29500}
+PY_ARGS=${@:4}
+
+WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/
+CHECKPOINT=$WORK_DIR/epoch_${EPOCH}.pth
+WORK_DIR_EVAL=$WORK_DIR/${DATASET}_at_epoch_${EPOCH}/
+
+# extract backbone
+if [ ! -f "${CHECKPOINT::(-4)}_extracted.pth" ]; then
+    python tools/extract_backbone_weights.py $CHECKPOINT \
+        --save-path ${CHECKPOINT::(-4)}_extracted.pth
+fi
+
+# train
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    tools/train.py \
+    configs/linear_classifier/${DATASET}/r50_multihead.py \
+    --pretrained ${CHECKPOINT::(-4)}_extracted.pth \
+    --work_dir ${WORK_DIR_EVAL} --seed 0 --launcher="pytorch" ${PY_ARGS}
+
+# test
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    tools/test.py \
+    configs/linear_classifier/${DATASET}/r50_multihead.py \
+    ${WORK_DIR_EVAL}/latest.pth \
+    --work_dir ${WORK_DIR_EVAL} --launcher="pytorch"
--- a/benchmarks/dist_test_svm.sh
+++ b/benchmarks/dist_test_svm.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+set -x
+
+CFG=$1
+EPOCH=$2
+FEAT_LIST=$3
+GPUS=$4
+WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/
+
+bash tools/dist_extract.sh $CFG $WORK_DIR/epoch_${EPOCH}.pth $GPUS
+
+bash benchmarks/eval_svm.sh $WORK_DIR $FEAT_LIST
+
+bash benchmarks/eval_svm_lowshot.sh $WORK_DIR $FEAT_LIST
--- a/benchmarks/eval_svm.sh
+++ b/benchmarks/eval_svm.sh
@ -0,0 +1,38 @@
+#!/bin/bash
+WORK_DIR=$1
+FEAT_LIST=${2:-"feat5"} # "feat1 feat2 feat3 feat4 feat5"
+TRAIN_SVM_FLAG=true
+TEST_SVM_FLAG=true
+DATA="data/VOCdevkit/VOC2007/SVMLabels"
+
+# config svm
+costs="1.0,10.0,100.0"
+
+# -p: the logs directory may already exist from an earlier evaluation.
+mkdir -p "$WORK_DIR/logs"
+for feat in $FEAT_LIST; do
+    echo "For feature: $feat" 2>&1 | tee -a "$WORK_DIR/logs/eval_svm.log"
+    # train svm
+    if $TRAIN_SVM_FLAG; then
+        # Clear only this feature's directory; removing all of svm/ here
+        # would delete the results of features handled earlier in the loop.
+        rm -rf "$WORK_DIR/svm/voc07_${feat}"
+        mkdir -p "$WORK_DIR/svm/voc07_${feat}"
+        echo "training svm ..."
+        python benchmarks/svm_tools/train_svm_kfold_parallel.py \
+            --data_file "$WORK_DIR/features/voc07_trainval_${feat}.npy" \
+            --targets_data_file $DATA/train_labels.npy \
+            --costs_list $costs \
+            --output_path "$WORK_DIR/svm/voc07_${feat}"
+    fi
+
+    # test svm
+    if $TEST_SVM_FLAG; then
+        echo "testing svm ..."
+        python benchmarks/svm_tools/test_svm.py \
+            --data_file "$WORK_DIR/features/voc07_test_${feat}.npy" \
+            --json_targets $DATA/test_targets.json \
+            --targets_data_file $DATA/test_labels.npy \
+            --costs_list $costs \
+            --generate_json 1 \
+            --output_path "$WORK_DIR/svm/voc07_${feat}" 2>&1 | tee -a "$WORK_DIR/logs/eval_svm.log"
+    fi
+
+done
--- a/benchmarks/eval_svm_lowshot.sh
+++ b/benchmarks/eval_svm_lowshot.sh
@ -0,0 +1,62 @@
+#!/bin/bash
+WORK_DIR=$1
+MODE="full"
+FEAT_LIST=${2:-"feat5"} # "feat1 feat2 feat3 feat4 feat5"
+TRAIN_SVM_LOWSHOT_FLAG=true
+TEST_SVM_LOWSHOT_FLAG=true
+AGGREGATE_FLAG=true
+DATA="data/VOCdevkit/VOC2007/SVMLabels"
+
+# config svm
+costs="1.0,10.0,100.0"
+if [ "$MODE" == "fast" ]; then
+    shots="96"
+else
+    shots="1 2 4 8 16 32 64 96"
+fi
+
+# -p: the logs directory may already exist from an earlier evaluation.
+mkdir -p "$WORK_DIR/logs"
+for feat in $FEAT_LIST; do
+    echo "For feature: $feat" 2>&1 | tee -a "$WORK_DIR/logs/eval_svm_lowshot.log"
+    # train lowshot svm
+    if $TRAIN_SVM_LOWSHOT_FLAG; then
+        # Clear only this feature's directory; removing all of svm_lowshot/
+        # here would delete the results of features handled earlier.
+        rm -rf "$WORK_DIR/svm_lowshot/voc07_${feat}"
+        mkdir -p "$WORK_DIR/svm_lowshot/voc07_${feat}"
+        echo "training svm low-shot ..."
+        for s in {1..5}; do
+            for k in $shots; do
+                echo -e "\ts${s} k${k}"
+                python benchmarks/svm_tools/train_svm_low_shot.py \
+                    --data_file "$WORK_DIR/features/voc07_trainval_${feat}.npy" \
+                    --targets_data_file $DATA/low_shot/labels/train_targets_sample${s}_k${k}.npy \
+                    --costs_list $costs \
+                    --output_path "$WORK_DIR/svm_lowshot/voc07_${feat}"
+            done
+        done
+    fi
+
+    # test lowshot svm
+    if $TEST_SVM_LOWSHOT_FLAG; then
+        echo "testing svm low-shot ..."
+        python benchmarks/svm_tools/test_svm_low_shot.py \
+            --data_file "$WORK_DIR/features/voc07_test_${feat}.npy" \
+            --targets_data_file $DATA/test_labels.npy \
+            --json_targets $DATA/test_targets.json \
+            --generate_json 1 \
+            --costs_list $costs \
+            --output_path "$WORK_DIR/svm_lowshot/voc07_${feat}" \
+            --k_values "${shots// /,}" \
+            --sample_inds "0,1,2,3,4" \
+            --dataset "voc"
+    fi
+
+    # aggregate testing results
+    if $AGGREGATE_FLAG; then
+        echo "aggregating svm low-shot ..."
+        python benchmarks/svm_tools/aggregate_low_shot_svm_stats.py \
+            --output_path "$WORK_DIR/svm_lowshot/voc07_${feat}" \
+            --k_values "${shots// /,}" \
+            --sample_inds "0,1,2,3,4" 2>&1 | tee -a "$WORK_DIR/logs/eval_svm_lowshot.log"
+    fi
+
+done
--- a/benchmarks/extract_info/voc07.py
+++ b/benchmarks/extract_info/voc07.py
@ -0,0 +1,20 @@
+data_source_cfg = dict(type='ImageList', memcached=False, mclient_path=None)
+data_root = "data/VOCdevkit/VOC2007/JPEGImages"
+data_all_list = "data/VOCdevkit/VOC2007/Lists/trainvaltest.txt"
+split_at = [5011]
+split_name = ['voc07_trainval', 'voc07_test']
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+
+data = dict(
+    imgs_per_gpu=32,
+    workers_per_gpu=2,
+    extract=dict(
+        type="ExtractDataset",
+        data_source=dict(
+            list_file=data_all_list, root=data_root, **data_source_cfg),
+        pipeline=[
+            dict(type='Resize', size=256),
+            dict(type='Resize', size=(224, 224)),
+            dict(type='ToTensor'),
+            dict(type='Normalize', **img_norm_cfg),
+        ]))
--- a/benchmarks/srun_test_cls.sh
+++ b/benchmarks/srun_test_cls.sh
@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+PARTITION=$1
+CFG=$2
+EPOCH=$3
+DATASET=$4 # imagenet or places205
+PY_ARGS=${@:5}
+JOB_NAME="openselfsup"
+GPUS=${GPUS:-1}
+GPUS_PER_NODE=${GPUS_PER_NODE:-1}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+
+WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/
+CHECKPOINT=$WORK_DIR/epoch_${EPOCH}.pth
+WORK_DIR_EVAL=$WORK_DIR/${DATASET}_at_epoch_${EPOCH}/
+
+# extract backbone
+if [ ! -f "${CHECKPOINT::(-4)}_extracted.pth" ]; then
+    srun -p ${PARTITION} \
+        python tools/extract_backbone_weights.py $CHECKPOINT \
+        --save-path ${CHECKPOINT::(-4)}_extracted.pth
+fi
+
+# train
+GLOG_vmodule=MemcachedClient=-1 \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/train.py \
+        configs/linear_classifier/${DATASET}/r50_multihead.py \
+        --pretrained ${CHECKPOINT::(-4)}_extracted.pth \
+        --work_dir ${WORK_DIR_EVAL} --seed 0 --launcher="slurm" ${PY_ARGS}
+
+# test
+GLOG_vmodule=MemcachedClient=-1 \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/test.py \
+        configs/linear_classifier/${DATASET}/r50_multihead.py \
+        ${WORK_DIR_EVAL}/latest.pth \
+        --work_dir ${WORK_DIR_EVAL} --launcher="slurm"
--- a/benchmarks/srun_test_semi.sh
+++ b/benchmarks/srun_test_semi.sh
@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+PARTITION=$1
+CFG=$2
+EPOCH=$3
+PERCENT=$4
+PY_ARGS=${@:5}
+JOB_NAME="openselfsup"
+GPUS=${GPUS:-1}
+GPUS_PER_NODE=${GPUS_PER_NODE:-1}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+
+WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/
+CHECKPOINT=$WORK_DIR/epoch_${EPOCH}.pth
+WORK_DIR_EVAL=$WORK_DIR/imagenet_semi_${PERCENT}percent_at_epoch_${EPOCH}/
+
+# PERCENT selects configs/semisup_classification/imagenet_${PERCENT}percent;
+# only 1 and 10 are accepted. Fail with a non-zero status so callers notice.
+if [ "$PERCENT" != "1" ] && [ "$PERCENT" != "10" ]; then
+    echo "ERROR: PERCENT must be in {1, 10}" >&2
+    exit 1
+fi
+# extract backbone
+if [ ! -f "${CHECKPOINT::(-4)}_extracted.pth" ]; then
+    srun -p ${PARTITION} \
+        python tools/extract_backbone_weights.py $CHECKPOINT \
+        --save-path ${CHECKPOINT::(-4)}_extracted.pth
+fi
+
+# train
+GLOG_vmodule=MemcachedClient=-1 \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/train.py \
+        configs/semisup_classification/imagenet_${PERCENT}percent/r50.py \
+        --pretrained ${CHECKPOINT::(-4)}_extracted.pth \
+        --work_dir ${WORK_DIR_EVAL} --seed 0 --launcher="slurm" ${PY_ARGS}
+
+# test
+GLOG_vmodule=MemcachedClient=-1 \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/test.py \
+        configs/semisup_classification/imagenet_${PERCENT}percent/r50.py \
+        ${WORK_DIR_EVAL}/latest.pth \
+        --work_dir ${WORK_DIR_EVAL} --launcher="slurm"
--- a/benchmarks/srun_test_svm.sh
+++ b/benchmarks/srun_test_svm.sh
@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -e
+set -x
+
+PARTITION=$1
+CFG=$2
+EPOCH=$3
+FEAT=$4
+WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/
+
+bash tools/srun_extract.sh $PARTITION $CFG $WORK_DIR/epoch_${EPOCH}.pth
+
+srun -p $PARTITION bash benchmarks/eval_svm.sh $WORK_DIR $FEAT
+
+srun -p $PARTITION bash benchmarks/eval_svm.sh $WORK_DIR $FEAT
--- a/benchmarks/svm_tools/aggregate_low_shot_svm_stats.py
+++ b/benchmarks/svm_tools/aggregate_low_shot_svm_stats.py
@ -0,0 +1,128 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+################################################################################
+"""
+Aggregate the stats over various independent samples for low-shot svm training.
+Stats computed: mean, max, min, std
+
+Relevant transfer tasks: Low-shot Image Classification VOC07 and Places205 low
+shot samples.
+"""
+
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import argparse
+import logging
+import numpy as np
+import os
+import sys
+
+# create the logger
+FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
+logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
+logger = logging.getLogger(__name__)
+
+
+def save_stats(output_dir, stat, output):
+    out_file = os.path.join(output_dir, 'test_ap_{}.npy'.format(stat))
+    logger.info('Saving {} to: {} {}'.format(stat, out_file, output.shape))
+    np.save(out_file, output)
+
+
+def aggregate_stats(opts):
+    k_values = [int(val) for val in opts.k_values.split(",")]
+    sample_inds = [int(val) for val in opts.sample_inds.split(",")]
+    logger.info(
+        'Aggregating stats for k-values: {} and sample_inds: {}'.format(
+            k_values, sample_inds))
+
+    output_mean, output_max, output_min, output_std = [], [], [], []
+    for k_idx in range(len(k_values)):
+        k_low = k_values[k_idx]
+        k_val_output = []
+        for inds in range(len(sample_inds)):
+            sample_idx = sample_inds[inds]
+            file_name = 'test_ap_sample{}_k{}.npy'.format(
+                sample_idx + 1, k_low)
+            filepath = os.path.join(opts.output_path, file_name)
+            if os.path.exists(filepath):
+                k_val_output.append(np.load(filepath, encoding='latin1'))
+            else:
+                logger.info('file does not exist: {}'.format(filepath))
+        # import pdb; pdb.set_trace()
+        k_val_output = np.concatenate(k_val_output, axis=0)
+        k_low_max = np.max(
+            k_val_output, axis=0).reshape(-1, k_val_output.shape[1])
+        k_low_min = np.min(
+            k_val_output, axis=0).reshape(-1, k_val_output.shape[1])
+        k_low_mean = np.mean(
+            k_val_output, axis=0).reshape(-1, k_val_output.shape[1])
+        k_low_std = np.std(
+            k_val_output, axis=0).reshape(-1, k_val_output.shape[1])
+        output_mean.append(k_low_mean)
+        output_min.append(k_low_min)
+        output_max.append(k_low_max)
+        output_std.append(k_low_std)
+
+    output_mean = np.concatenate(output_mean, axis=0)
+    output_min = np.concatenate(output_min, axis=0)
+    output_max = np.concatenate(output_max, axis=0)
+    output_std = np.concatenate(output_std, axis=0)
+
+    save_stats(opts.output_path, 'mean', output_mean)
+    save_stats(opts.output_path, 'min', output_min)
+    save_stats(opts.output_path, 'max', output_max)
+    save_stats(opts.output_path, 'std', output_std)
+
+    argmax_cls = np.argmax(output_mean, axis=1)
+    argmax_mean, argmax_min, argmax_max, argmax_std = [], [], [], []
+    for idx in range(len(argmax_cls)):
+        argmax_mean.append(100.0 * output_mean[idx, argmax_cls[idx]])
+        argmax_min.append(100.0 * output_min[idx, argmax_cls[idx]])
+        argmax_max.append(100.0 * output_max[idx, argmax_cls[idx]])
+        argmax_std.append(100.0 * output_std[idx, argmax_cls[idx]])
+    for idx in range(len(argmax_max)):
+        logger.info('mean/min/max/std: {} / {} / {} / {}'.format(
+            round(argmax_mean[idx], 2),
+            round(argmax_min[idx], 2),
+            round(argmax_max[idx], 2),
+            round(argmax_std[idx], 2),
+        ))
+    logger.info('All done!!')
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Low shot SVM model test')
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default=None,
+        help="Numpy file containing test AP result files")
+    parser.add_argument(
+        '--k_values',
+        type=str,
+        default=None,
+        help="Low-shot k-values for svm testing. Comma separated")
+    parser.add_argument(
+        '--sample_inds',
+        type=str,
+        default=None,
+        help="sample_inds for which to test svm. Comma separated")
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    opts = parser.parse_args()
+    logger.info(opts)
+    aggregate_stats(opts)
+
+
+if __name__ == '__main__':
+    main()
--- a/benchmarks/svm_tools/svm_helper.py
+++ b/benchmarks/svm_tools/svm_helper.py
@ -0,0 +1,171 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+################################################################################
+"""
+Helper module for svm training and testing.
+"""
+
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import logging
+import numpy as np
+import os
+import sys
+
+# module-level logger: INFO and above go to stdout, tagged with file and line
+FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
+logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
+logger = logging.getLogger(__name__)
+
+
+# Python 2 and python 3 have different floating point precision. The following
+# trick helps keep the backwards compatibility.
+def py2_py3_compatible_cost(cost):
+    return str(float("{:.17f}".format(cost)))
+
+
+def get_svm_train_output_files(cls, cost, output_path):
+    cls_cost = str(cls) + '_cost' + py2_py3_compatible_cost(cost)
+    out_file = os.path.join(output_path, 'cls' + cls_cost + '.pickle')
+    ap_matrix_out_file = os.path.join(output_path,
+                                      'AP_cls' + cls_cost + '.npy')
+    return out_file, ap_matrix_out_file
+
+
+def parse_cost_list(costs):
+    costs_list = [float(cost) for cost in costs.split(",")]
+    start_num, end_num = 4, 20
+    for num in range(start_num, end_num):
+        costs_list.append(0.5**num)
+    return costs_list
+
+
+def normalize_features(features):
+    feats_norm = np.linalg.norm(features, axis=1)
+    features = features / (feats_norm + 1e-5)[:, np.newaxis]
+    return features
+
+
+def load_input_data(data_file, targets_file):
+    # load the features and the targets
+    #logger.info('loading features and targets...')
+    targets = np.load(targets_file, encoding='latin1')
+    features = np.array(np.load(data_file,
+                                encoding='latin1')).astype(np.float64)
+    assert features.shape[0] == targets.shape[0], "Mismatched #images"
+    logger.info('Loaded features: {} and targets: {}'.format(
+        features.shape, targets.shape))
+    return features, targets
+
+
+def calculate_ap(rec, prec):
+    """
+    Computes the AP under the precision recall curve.
+    """
+    rec, prec = rec.reshape(rec.size, 1), prec.reshape(prec.size, 1)
+    z, o = np.zeros((1, 1)), np.ones((1, 1))
+    mrec, mpre = np.vstack((z, rec, o)), np.vstack((z, prec, z))
+    for i in range(len(mpre) - 2, -1, -1):
+        mpre[i] = max(mpre[i], mpre[i + 1])
+
+    indices = np.where(mrec[1:] != mrec[0:-1])[0] + 1
+    ap = 0
+    for i in indices:
+        ap = ap + (mrec[i] - mrec[i - 1]) * mpre[i]
+    return ap
+
+
+def get_precision_recall(targets, preds):
+    """
+    [P, R, score, ap] = get_precision_recall(targets, preds)
+    Input    :
+        targets  : number of occurrences of this class in the ith image
+        preds    : score for this image
+    Output   :
+        P, R   : precision and recall
+        score  : score which corresponds to the particular precision and recall
+        ap     : average precision
+    """
+    # binarize targets
+    targets = np.array(targets > 0, dtype=np.float32)
+    tog = np.hstack((targets[:, np.newaxis].astype(np.float64),
+                     preds[:, np.newaxis].astype(np.float64)))
+    ind = np.argsort(preds)
+    ind = ind[::-1]
+    score = np.array([tog[i, 1] for i in ind])
+    sortcounts = np.array([tog[i, 0] for i in ind])
+
+    tp = sortcounts
+    fp = sortcounts.copy()
+    for i in range(sortcounts.shape[0]):
+        if sortcounts[i] >= 1:
+            fp[i] = 0.
+        elif sortcounts[i] < 1:
+            fp[i] = 1.
+    P = np.cumsum(tp) / (np.cumsum(tp) + np.cumsum(fp))
+    numinst = np.sum(targets)
+    R = np.cumsum(tp) / numinst
+    ap = calculate_ap(R, P)
+    return P, R, score, ap
+
+
+def get_low_shot_output_file(opts, cls, cost, suffix):
+    # in case of low-shot training, we train for 5 independent samples
+    # (sample{}) and vary low-shot amount (k{}). The input data should have
+    # sample{}_k{} information that we extract in suffix below.
+    # logger.info('Suffix: {}'.format(suffix))
+    cls_cost = str(cls) + '_cost' + py2_py3_compatible_cost(cost)
+    out_file = os.path.join(opts.output_path,
+                            'cls' + cls_cost + '_' + suffix + '.pickle')
+    return out_file
+
+
+def get_low_shot_svm_classes(targets, dataset):
+    # classes for which SVM testing should be done
+    num_classes, cls_list = None, None
+    if dataset == 'voc':
+        num_classes = targets.shape[1]
+        cls_list = range(num_classes)
+    elif dataset == 'places':
+        # each image in places has a target cls [0, .... ,204]
+        num_classes = len(set(targets[:, 0].tolist()))
+        cls_list = list(set(targets[:, 0].tolist()))
+    else:
+        logger.info('Dataset not recognized. Abort!')
+    logger.info('Testing SVM for classes: {}'.format(cls_list))
+    logger.info('Num classes: {}'.format(num_classes))
+    return num_classes, cls_list
+
+
+def get_cls_feats_labels(cls, features, targets, dataset):
+    out_feats, out_cls_labels = None, None
+    if dataset == 'voc':
+        cls_labels = targets[:, cls].astype(dtype=np.int32, copy=True)
+        # find the indices for positive/negative imgs. Remove the ignore label.
+        out_data_inds = (targets[:, cls] != -1)
+        out_feats = features[out_data_inds]
+        out_cls_labels = cls_labels[out_data_inds]
+        # label 0 = not present, set it to -1 as svm train target.
+        # Make the svm train target labels as -1, 1.
+        out_cls_labels[np.where(out_cls_labels == 0)] = -1
+    elif dataset == 'places':
+        out_feats = features
+        out_cls_labels = targets.astype(dtype=np.int32, copy=True)
+        # for the given class, get the relevant positive/negative images and
+        # make the label 1, -1
+        cls_inds = np.where(targets[:, 0] == cls)
+        non_cls_inds = (targets[:, 0] != cls)
+        out_cls_labels[non_cls_inds] = -1
+        out_cls_labels[cls_inds] = 1
+        # finally reshape into the format taken by sklearn svm package.
+        out_cls_labels = out_cls_labels.reshape(-1)
+    else:
+        raise Exception('args.dataset not recognized')
+    return out_feats, out_cls_labels
--- a/benchmarks/svm_tools/test_svm.py
+++ b/benchmarks/svm_tools/test_svm.py
@ -0,0 +1,174 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+################################################################################
+"""
+SVM test for image classification.
+
+Relevant transfer tasks: Image Classification VOC07 and COCO2014.
+"""
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import argparse
+import json
+import logging
+import numpy as np
+import os
+import pickle
+import six
+import sys
+
+import svm_helper
+
+# module-level logger: INFO and above go to stdout, tagged with file and line
+FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
+logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
+logger = logging.getLogger(__name__)
+
+
+def get_chosen_costs(opts, num_classes):
+    costs_list = svm_helper.parse_cost_list(opts.costs_list)
+    train_ap_matrix = np.zeros((num_classes, len(costs_list)))
+    for cls in range(num_classes):
+        for cost_idx in range(len(costs_list)):
+            cost = costs_list[cost_idx]
+            _, ap_out_file = svm_helper.get_svm_train_output_files(
+                cls, cost, opts.output_path)
+            train_ap_matrix[cls][cost_idx] = float(
+                np.load(ap_out_file, encoding='latin1')[0])
+    argmax_cls = np.argmax(train_ap_matrix, axis=1)
+    chosen_cost = [costs_list[idx] for idx in argmax_cls]
+    logger.info('chosen_cost: {}'.format(chosen_cost))
+    np.save(
+        os.path.join(opts.output_path, 'crossval_ap.npy'),
+        np.array(train_ap_matrix))
+    np.save(
+        os.path.join(opts.output_path, 'chosen_cost.npy'),
+        np.array(chosen_cost))
+    logger.info('saved crossval_ap AP to file: {}'.format(
+        os.path.join(opts.output_path, 'crossval_ap.npy')))
+    logger.info('saved chosen costs to file: {}'.format(
+        os.path.join(opts.output_path, 'chosen_cost.npy')))
+    return np.array(chosen_cost)
+
+
+def load_json(file_path):
+    assert os.path.exists(file_path), "{} does not exist".format(file_path)
+    with open(file_path, 'r') as fp:
+        data = json.load(fp)
+    img_ids = list(data.keys())
+    cls_names = list(data[img_ids[0]].keys())
+    return img_ids, cls_names
+
+
+def test_svm(opts):
+    assert os.path.exists(opts.data_file), "Data file not found. Abort!"
+    json_predictions, img_ids, cls_names = {}, [], []
+    if opts.generate_json:
+        img_ids, cls_names = load_json(opts.json_targets)
+
+    features, targets = svm_helper.load_input_data(opts.data_file,
+                                                   opts.targets_data_file)
+    # normalize the features: N x 9216 (example shape)
+    features = svm_helper.normalize_features(features)
+    num_classes = targets.shape[1]
+    logger.info('Num classes: {}'.format(num_classes))
+
+    # get the chosen cost that maximizes the cross-validation AP per class
+    costs_list = get_chosen_costs(opts, num_classes)
+
+    ap_matrix = np.zeros((num_classes, 1))
+    for cls in range(num_classes):
+        cost = costs_list[cls]
+        logger.info('Testing model for cls: {} cost: {}'.format(cls, cost))
+        model_file = os.path.join(
+            opts.output_path,
+            'cls' + str(cls) + '_cost' + str(cost) + '.pickle')
+        with open(model_file, 'rb') as fopen:
+            if six.PY2:
+                model = pickle.load(fopen)
+            else:
+                model = pickle.load(fopen, encoding='latin1')
+        prediction = model.decision_function(features)
+        if opts.generate_json:
+            cls_name = cls_names[cls]
+            for idx in range(len(prediction)):
+                img_id = img_ids[idx]
+                if img_id in json_predictions:
+                    json_predictions[img_id][cls_name] = prediction[idx]
+                else:
+                    out_lbl = {}
+                    out_lbl[cls_name] = prediction[idx]
+                    json_predictions[img_id] = out_lbl
+
+        cls_labels = targets[:, cls]
+        # meaning of labels in VOC/COCO original loaded target files:
+        # label 0 = not present, set it to -1 as svm train target
+        # label 1 = present. Make the svm train target labels as -1, 1.
+        evaluate_data_inds = (targets[:, cls] != -1)
+        eval_preds = prediction[evaluate_data_inds]
+        eval_cls_labels = cls_labels[evaluate_data_inds]
+        eval_cls_labels[np.where(eval_cls_labels == 0)] = -1
+        P, R, score, ap = svm_helper.get_precision_recall(
+            eval_cls_labels, eval_preds)
+        ap_matrix[cls][0] = ap
+    if opts.generate_json:
+        output_file = os.path.join(opts.output_path, 'json_preds.json')
+        with open(output_file, 'w') as fp:
+            json.dump(json_predictions, fp)
+        logger.info('Saved json predictions to: {}'.format(output_file))
+    logger.info('Mean AP: {}'.format(np.mean(ap_matrix, axis=0)))
+    np.save(os.path.join(opts.output_path, 'test_ap.npy'), np.array(ap_matrix))
+    logger.info('saved test AP to file: {}'.format(
+        os.path.join(opts.output_path, 'test_ap.npy')))
+
+
+def main():
+    parser = argparse.ArgumentParser(description='SVM model test')
+    parser.add_argument(
+        '--data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image features and labels")
+    parser.add_argument(
+        '--json_targets',
+        type=str,
+        default=None,
+        help="Json file containing json targets")
+    parser.add_argument(
+        '--targets_data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image labels")
+    parser.add_argument(
+        '--costs_list',
+        type=str,
+        default="0.01,0.1",
+        help="comma separated string containing list of costs")
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default=None,
+        help="path where trained SVM models are saved")
+    parser.add_argument(
+        '--generate_json',
+        type=int,
+        default=0,
+        help="Whether to generate json files for output")
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    opts = parser.parse_args()
+    logger.info(opts)
+    test_svm(opts)
+
+
+if __name__ == '__main__':
+    main()
--- a/benchmarks/svm_tools/test_svm_low_shot.py
+++ b/benchmarks/svm_tools/test_svm_low_shot.py
@ -0,0 +1,212 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+################################################################################
+"""
+SVM test for low shot image classification.
+
+Relevant transfer tasks: Low-shot Image Classification VOC07 and Places205 low
+shot samples.
+"""
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import argparse
+import json
+import logging
+import numpy as np
+import os
+import pickle
+import six
+import sys
+
+import svm_helper
+
+# module-level logger: INFO and above go to stdout, tagged with file and line
+FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
+logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
+logger = logging.getLogger(__name__)
+
+
+def load_json(file_path):
+    assert os.path.exists(file_path), "{} does not exist".format(file_path)
+    with open(file_path, 'r') as fp:
+        data = json.load(fp)
+    img_ids = list(data.keys())
+    cls_names = list(data[img_ids[0]].keys())
+    return img_ids, cls_names
+
+
+def save_json_predictions(opts, cost, sample_idx, k_low, features, cls_list,
+                          cls_names, img_ids):
+    num_classes = len(cls_list)
+    json_predictions = {}
+    for cls in range(num_classes):
+        suffix = 'sample{}_k{}'.format(sample_idx + 1, k_low)
+        model_file = svm_helper.get_low_shot_output_file(
+            opts, cls, cost, suffix)
+        with open(model_file, 'rb') as fopen:
+            if six.PY2:
+                model = pickle.load(fopen)
+            else:
+                model = pickle.load(fopen, encoding='latin1')
+        prediction = model.decision_function(features)
+        cls_name = cls_names[cls]
+        for idx in range(len(prediction)):
+            img_id = img_ids[idx]
+            if img_id in json_predictions:
+                json_predictions[img_id][cls_name] = prediction[idx]
+            else:
+                out_lbl = {}
+                out_lbl[cls_name] = prediction[idx]
+                json_predictions[img_id] = out_lbl
+
+    output_file = os.path.join(opts.output_path,
+                               'test_{}_json_preds.json'.format(suffix))
+    with open(output_file, 'w') as fp:
+        json.dump(json_predictions, fp)
+    logger.info('Saved json predictions to: {}'.format(output_file))
+
+
+def test_svm_low_shot(opts):
+    k_values = [int(val) for val in opts.k_values.split(",")]
+    sample_inds = [int(val) for val in opts.sample_inds.split(",")]
+    logger.info('Testing svm for k-values: {} and sample_inds: {}'.format(
+        k_values, sample_inds))
+
+    img_ids, cls_names = [], []
+    if opts.generate_json:
+        img_ids, cls_names = load_json(opts.json_targets)
+
+    assert os.path.exists(opts.data_file), "Data file not found. Abort!"
+    # we test the svms on the full test set. Given the test features and the
+    # targets, we test it for various k-values (low-shot), cost values and
+    # 5 independent samples.
+    features, targets = svm_helper.load_input_data(opts.data_file,
+                                                   opts.targets_data_file)
+    # normalize the features: N x 9216 (example shape)
+    features = svm_helper.normalize_features(features)
+
+    # parse the cost values for training the SVM on
+    costs_list = svm_helper.parse_cost_list(opts.costs_list)
+    logger.info('Testing SVM for costs: {}'.format(costs_list))
+
+    # classes for which SVM testing should be done
+    num_classes, cls_list = svm_helper.get_low_shot_svm_classes(
+        targets, opts.dataset)
+
+    # create the output for per sample, per k-value and per cost.
+    sample_ap_matrices = []
+    for _ in range(len(sample_inds)):
+        ap_matrix = np.zeros((len(k_values), len(costs_list)))
+        sample_ap_matrices.append(ap_matrix)
+
+    # the test goes like this: For a given sample, for a given k-value and a
+    # given cost value, we evaluate the trained svm model for all classes.
+    # After computing over all classes, we get the mean AP value over all
+    # classes. We hence end up with: output = [sample][k_value][cost]
+    for inds in range(len(sample_inds)):
+        sample_idx = sample_inds[inds]
+        for k_idx in range(len(k_values)):
+            k_low = k_values[k_idx]
+            suffix = 'sample{}_k{}'.format(sample_idx + 1, k_low)
+            for cost_idx in range(len(costs_list)):
+                cost = costs_list[cost_idx]
+                local_cost_ap = np.zeros((num_classes, 1))
+                for cls in cls_list:
+                    logger.info(
+                        'Test sample/k_value/cost/cls: {}/{}/{}/{}'.format(
+                            sample_idx + 1, k_low, cost, cls))
+                    model_file = svm_helper.get_low_shot_output_file(
+                        opts, cls, cost, suffix)
+                    with open(model_file, 'rb') as fopen:
+                        if six.PY2:
+                            model = pickle.load(fopen)
+                        else:
+                            model = pickle.load(fopen, encoding='latin1')
+                    prediction = model.decision_function(features)
+                    eval_preds, eval_cls_labels = svm_helper.get_cls_feats_labels(
+                        cls, prediction, targets, opts.dataset)
+                    P, R, score, ap = svm_helper.get_precision_recall(
+                        eval_cls_labels, eval_preds)
+                    local_cost_ap[cls][0] = ap
+                mean_cost_ap = np.mean(local_cost_ap, axis=0)
+                sample_ap_matrices[inds][k_idx][cost_idx] = mean_cost_ap
+            out_k_sample_file = os.path.join(
+                opts.output_path,
+                'test_ap_sample{}_k{}.npy'.format(sample_idx + 1, k_low))
+            save_data = sample_ap_matrices[inds][k_idx]
+            save_data = save_data.reshape((1, -1))
+            np.save(out_k_sample_file, save_data)
+            logger.info('Saved sample test k_idx AP to file: {} {}'.format(
+                out_k_sample_file, save_data.shape))
+            if opts.generate_json:
+                argmax_cls = np.argmax(save_data, axis=1)
+                chosen_cost = costs_list[argmax_cls[0]]
+                logger.info('chosen cost: {}'.format(chosen_cost))
+                save_json_predictions(opts, chosen_cost, sample_idx, k_low,
+                                      features, cls_list, cls_names, img_ids)
+    logger.info('All done!!')
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Low shot SVM model test')
+    parser.add_argument(
+        '--data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image features and labels")
+    parser.add_argument(
+        '--targets_data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image labels")
+    parser.add_argument(
+        '--json_targets',
+        type=str,
+        default=None,
+        help="Numpy file containing json targets")
+    parser.add_argument(
+        '--generate_json',
+        type=int,
+        default=0,
+        help="Whether to generate json files for output")
+    parser.add_argument(
+        '--costs_list',
+        type=str,
+        default=
+        "0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,1.0,10.0,100.0",
+        help="comma separated string containing list of costs")
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default=None,
+        help="path where trained SVM models are saved")
+    parser.add_argument(
+        '--k_values',
+        type=str,
+        default="1,2,4,8,16,32,64,96",
+        help="Low-shot k-values for svm testing. Comma separated")
+    parser.add_argument(
+        '--sample_inds',
+        type=str,
+        default="0,1,2,3,4",
+        help="sample_inds for which to test svm. Comma separated")
+    parser.add_argument(
+        '--dataset', type=str, default="voc", help='voc | places')
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    opts = parser.parse_args()
+    logger.info(opts)
+    test_svm_low_shot(opts)
+
+
+if __name__ == '__main__':
+    main()
--- a/benchmarks/svm_tools/train_svm_kfold.py
+++ b/benchmarks/svm_tools/train_svm_kfold.py
@ -0,0 +1,162 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+################################################################################
+"""
+SVM training using 3-fold cross-validation.
+
+Relevant transfer tasks: Image Classification VOC07 and COCO2014.
+"""
+
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import argparse
+import logging
+import numpy as np
+import os
+import pickle
+import sys
+from tqdm import tqdm
+from sklearn.svm import LinearSVC
+from sklearn.model_selection import cross_val_score
+
+import svm_helper
+
+import time
+
+# module-level logger: INFO and above go to stdout, tagged with file and line
+FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
+logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
+logger = logging.getLogger(__name__)
+
+
+def train_svm(opts):
+    assert os.path.exists(opts.data_file), "Data file not found. Abort!"
+    if not os.path.exists(opts.output_path):
+        os.makedirs(opts.output_path)
+
+    features, targets = svm_helper.load_input_data(opts.data_file,
+                                                   opts.targets_data_file)
+    # normalize the features: N x 9216 (example shape)
+    features = svm_helper.normalize_features(features)
+
+    # parse the cost values for training the SVM on
+    costs_list = svm_helper.parse_cost_list(opts.costs_list)
+    #logger.info('Training SVM for costs: {}'.format(costs_list))
+
+    # classes for which SVM training should be done
+    if opts.cls_list:
+        cls_list = [int(cls) for cls in opts.cls_list.split(",")]
+    else:
+        num_classes = targets.shape[1]
+        cls_list = range(num_classes)
+    #logger.info('Training SVM for classes: {}'.format(cls_list))
+
+    for cls_idx in tqdm(range(len(cls_list))):
+        cls = cls_list[cls_idx]
+        for cost_idx in range(len(costs_list)):
+            start = time.time()
+            cost = costs_list[cost_idx]
+            out_file, ap_out_file = svm_helper.get_svm_train_output_files(
+                cls, cost, opts.output_path)
+            if os.path.exists(out_file) and os.path.exists(ap_out_file):
+                logger.info('SVM model exists: {}'.format(out_file))
+                logger.info('AP file exists: {}'.format(ap_out_file))
+            else:
+                #logger.info('Training model with the cost: {}'.format(cost))
+                clf = LinearSVC(
+                    C=cost,
+                    class_weight={
+                        1: 2,
+                        -1: 1
+                    },
+                    intercept_scaling=1.0,
+                    verbose=0,
+                    penalty='l2',
+                    loss='squared_hinge',
+                    tol=0.0001,
+                    dual=True,
+                    max_iter=2000,
+                )
+                cls_labels = targets[:, cls].astype(dtype=np.int32, copy=True)
+                # meaning of labels in VOC/COCO original loaded target files:
+                # label 0 = not present, set it to -1 as svm train target
+                # label 1 = present. Make the svm train target labels as -1, 1.
+                cls_labels[np.where(cls_labels == 0)] = -1
+                #num_positives = len(np.where(cls_labels == 1)[0])
+                #num_negatives = len(cls_labels) - num_positives
+
+                #logger.info('cls: {} has +ve: {} -ve: {} ratio: {}'.format(
+                #    cls, num_positives, num_negatives,
+                #    float(num_positives) / num_negatives)
+                #)
+                #logger.info('features: {} cls_labels: {}'.format(
+                #    features.shape, cls_labels.shape))
+                ap_scores = cross_val_score(
+                    clf,
+                    features,
+                    cls_labels,
+                    cv=3,
+                    scoring='average_precision')
+                clf.fit(features, cls_labels)
+
+                #logger.info('cls: {} cost: {} AP: {} mean:{}'.format(
+                #    cls, cost, ap_scores, ap_scores.mean()))
+                #logger.info('Saving cls cost AP to: {}'.format(ap_out_file))
+                np.save(ap_out_file, np.array([ap_scores.mean()]))
+                #logger.info('Saving SVM model to: {}'.format(out_file))
+                with open(out_file, 'wb') as fwrite:
+                    pickle.dump(clf, fwrite)
+            print("time: {:.4g} s".format(time.time() - start))
+
+
+def main():
+    parser = argparse.ArgumentParser(description='SVM model training')
+    parser.add_argument(
+        '--data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image features")
+    parser.add_argument(
+        '--targets_data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image labels")
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default=None,
+        help="path where to save the trained SVM models")
+    parser.add_argument(
+        '--costs_list',
+        type=str,
+        default="0.01,0.1",
+        help="comma separated string containing list of costs")
+    parser.add_argument(
+        '--random_seed',
+        type=int,
+        default=100,
+        help="random seed for SVM classifier training")
+
+    parser.add_argument(
+        '--cls_list',
+        type=str,
+        default=None,
+        help="comma separated string list of classes to train")
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    opts = parser.parse_args()
+    #logger.info(opts)
+    train_svm(opts)
+
+
+if __name__ == '__main__':
+    main()
--- a/benchmarks/svm_tools/train_svm_kfold_parallel.py
+++ b/benchmarks/svm_tools/train_svm_kfold_parallel.py
@ -0,0 +1,151 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+################################################################################
+"""
+SVM training using 3-fold cross-validation.
+
+Relevant transfer tasks: Image Classification VOC07 and COCO2014.
+"""
+
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import multiprocessing as mp
+import tqdm
+import argparse
+import logging
+import numpy as np
+import os
+import pickle
+import sys
+from sklearn.svm import LinearSVC
+from sklearn.model_selection import cross_val_score
+
+import svm_helper
+
+import pdb
+
+
+def task(cls, cost, opts, features, targets):
+    out_file, ap_out_file = svm_helper.get_svm_train_output_files(
+        cls, cost, opts.output_path)
+    if not (os.path.exists(out_file) and os.path.exists(ap_out_file)):
+        clf = LinearSVC(
+            C=cost,
+            class_weight={
+                1: 2,
+                -1: 1
+            },
+            intercept_scaling=1.0,
+            verbose=0,
+            penalty='l2',
+            loss='squared_hinge',
+            tol=0.0001,
+            dual=True,
+            max_iter=2000,
+        )
+        cls_labels = targets[:, cls].astype(dtype=np.int32, copy=True)
+        cls_labels[np.where(cls_labels == 0)] = -1
+        ap_scores = cross_val_score(
+            clf, features, cls_labels, cv=3, scoring='average_precision')
+        clf.fit(features, cls_labels)
+        np.save(ap_out_file, np.array([ap_scores.mean()]))
+        with open(out_file, 'wb') as fwrite:
+            pickle.dump(clf, fwrite)
+    return 0
+
+
+def mp_helper(args):
+    return task(*args)
+
+
+def train_svm(opts):
+    assert os.path.exists(opts.data_file), "Data file not found. Abort!"
+    if not os.path.exists(opts.output_path):
+        os.makedirs(opts.output_path)
+
+    features, targets = svm_helper.load_input_data(opts.data_file,
+                                                   opts.targets_data_file)
+    # normalize the features: N x 9216 (example shape)
+    features = svm_helper.normalize_features(features)
+
+    # parse the cost values for training the SVM on
+    costs_list = svm_helper.parse_cost_list(opts.costs_list)
+
+    # classes for which SVM training should be done
+    if opts.cls_list:
+        cls_list = [int(cls) for cls in opts.cls_list.split(",")]
+    else:
+        num_classes = targets.shape[1]
+        cls_list = range(num_classes)
+
+    num_task = len(cls_list) * len(costs_list)
+    args_cls = []
+    args_cost = []
+    for cls in cls_list:
+        for cost in costs_list:
+            args_cls.append(cls)
+            args_cost.append(cost)
+    args_opts = [opts] * num_task
+    args_features = [features] * num_task
+    args_targets = [targets] * num_task
+
+    pool = mp.Pool(mp.cpu_count())
+    for _ in tqdm.tqdm(
+            pool.imap_unordered(
+                mp_helper,
+                zip(args_cls, args_cost, args_opts, args_features,
+                    args_targets)),
+            total=num_task):
+        pass
+
+
+def main():
+    parser = argparse.ArgumentParser(description='SVM model training')
+    parser.add_argument(
+        '--data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image features")
+    parser.add_argument(
+        '--targets_data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image labels")
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default=None,
+        help="path where to save the trained SVM models")
+    parser.add_argument(
+        '--costs_list',
+        type=str,
+        default="0.01,0.1",
+        help="comma separated string containing list of costs")
+    parser.add_argument(
+        '--random_seed',
+        type=int,
+        default=100,
+        help="random seed for SVM classifier training")
+
+    parser.add_argument(
+        '--cls_list',
+        type=str,
+        default=None,
+        help="comma separated string list of classes to train")
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    opts = parser.parse_args()
+    train_svm(opts)
+
+
+if __name__ == '__main__':
+    main()
--- a/benchmarks/svm_tools/train_svm_low_shot.py
+++ b/benchmarks/svm_tools/train_svm_low_shot.py
@ -0,0 +1,144 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+################################################################################
+"""
+Low Shot SVM training.
+
+Relevant transfer tasks: Low-shot Image Classification VOC07 and Places205 low
+shot samples.
+"""
+
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import argparse
+import logging
+import numpy as np
+import os
+import pickle
+import sys
+from sklearn.svm import LinearSVC
+from tqdm import tqdm
+
+import svm_helper
+
+import time
+
+# create the logger
+FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
+logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
+logger = logging.getLogger(__name__)
+
+
+def train_svm_low_shot(opts):
+    assert os.path.exists(opts.data_file), "Data file not found. Abort!"
+    if not os.path.exists(opts.output_path):
+        os.makedirs(opts.output_path)
+
+    features, targets = svm_helper.load_input_data(opts.data_file,
+                                                   opts.targets_data_file)
+    # normalize the features: N x 9216 (example shape)
+    features = svm_helper.normalize_features(features)
+
+    # parse the cost values for training the SVM on
+    costs_list = svm_helper.parse_cost_list(opts.costs_list)
+    #logger.info('Training SVM for costs: {}'.format(costs_list))
+
+    # classes for which SVM testing should be done
+    num_classes, cls_list = svm_helper.get_low_shot_svm_classes(
+        targets, opts.dataset)
+
+    for cls in tqdm(cls_list):
+        for cost_idx in range(len(costs_list)):
+            start = time.time()
+            cost = costs_list[cost_idx]
+            suffix = '_'.join(
+                opts.targets_data_file.split('/')[-1].split('.')[0].split('_')
+                [-2:])
+            out_file = svm_helper.get_low_shot_output_file(
+                opts, cls, cost, suffix)
+            if os.path.exists(out_file):
+                logger.info('SVM model exists: {}'.format(out_file))
+            else:
+                #logger.info('SVM model not found: {}'.format(out_file))
+                #logger.info('Training model with the cost: {}'.format(cost))
+                clf = LinearSVC(
+                    C=cost,
+                    class_weight={
+                        1: 2,
+                        -1: 1
+                    },
+                    intercept_scaling=1.0,
+                    verbose=0,
+                    penalty='l2',
+                    loss='squared_hinge',
+                    tol=0.0001,
+                    dual=True,
+                    max_iter=2000,
+                )
+                train_feats, train_cls_labels = svm_helper.get_cls_feats_labels(
+                    cls, features, targets, opts.dataset)
+                #num_positives = len(np.where(train_cls_labels == 1)[0])
+                #num_negatives = len(np.where(train_cls_labels == -1)[0])
+
+                #logger.info('cls: {} has +ve: {} -ve: {} ratio: {}'.format(
+                #    cls, num_positives, num_negatives,
+                #    float(num_positives) / num_negatives)
+                #)
+                #logger.info('features: {} cls_labels: {}'.format(
+                #    train_feats.shape, train_cls_labels.shape))
+                clf.fit(train_feats, train_cls_labels)
+                #logger.info('Saving SVM model to: {}'.format(out_file))
+                with open(out_file, 'wb') as fwrite:
+                    pickle.dump(clf, fwrite)
+            #print("time: {:.4g} s".format(time.time() - start))
+    #logger.info('All done!')
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Low-shot SVM model training')
+    parser.add_argument(
+        '--data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image features")
+    parser.add_argument(
+        '--targets_data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image labels")
+    parser.add_argument(
+        '--costs_list',
+        type=str,
+        default="0.01,0.1",
+        help="comma separated string containing list of costs")
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default=None,
+        help="path where to save the trained SVM models")
+    parser.add_argument(
+        '--random_seed',
+        type=int,
+        default=100,
+        help="random seed for SVM classifier training")
+    parser.add_argument(
+        '--dataset', type=str, default="voc", help='voc | places')
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    opts = parser.parse_args()
+
+    #logger.info(opts)
+    train_svm_low_shot(opts)
+
+
+if __name__ == '__main__':
+    main()
--- a/benchmarks/svm_tools/train_svm_low_shot_parallel.py
+++ b/benchmarks/svm_tools/train_svm_low_shot_parallel.py
@ -0,0 +1,145 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+################################################################################
+"""
+Low Shot SVM training.
+
+Relevant transfer tasks: Low-shot Image Classification VOC07 and Places205 low
+shot samples.
+"""
+
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import multiprocessing as mp
+import tqdm
+import argparse
+import logging
+import numpy as np
+import os
+import pickle
+import sys
+from sklearn.svm import LinearSVC
+
+import svm_helper
+
+import pdb
+
+
+def task(cls, cost, opts, features, targets):
+    suffix = '_'.join(
+        opts.targets_data_file.split('/')[-1].split('.')[0].split('_')[-2:])
+    out_file = svm_helper.get_low_shot_output_file(opts, cls, cost, suffix)
+    if not os.path.exists(out_file):
+        clf = LinearSVC(
+            C=cost,
+            class_weight={
+                1: 2,
+                -1: 1
+            },
+            intercept_scaling=1.0,
+            verbose=0,
+            penalty='l2',
+            loss='squared_hinge',
+            tol=0.0001,
+            dual=True,
+            max_iter=2000,
+        )
+        train_feats, train_cls_labels = svm_helper.get_cls_feats_labels(
+            cls, features, targets, opts.dataset)
+        clf.fit(train_feats, train_cls_labels)
+        #cls_labels = targets[:, cls].astype(dtype=np.int32, copy=True)
+        #cls_labels[np.where(cls_labels == 0)] = -1
+        #clf.fit(features, cls_labels)
+        with open(out_file, 'wb') as fwrite:
+            pickle.dump(clf, fwrite)
+    return 0
+
+
+def mp_helper(args):
+    return task(*args)
+
+
+def train_svm_low_shot(opts):
+    assert os.path.exists(opts.data_file), "Data file not found. Abort!"
+    if not os.path.exists(opts.output_path):
+        os.makedirs(opts.output_path)
+
+    features, targets = svm_helper.load_input_data(opts.data_file,
+                                                   opts.targets_data_file)
+    # normalize the features: N x 9216 (example shape)
+    features = svm_helper.normalize_features(features)
+
+    # parse the cost values for training the SVM on
+    costs_list = svm_helper.parse_cost_list(opts.costs_list)
+
+    # classes for which SVM testing should be done
+    num_classes, cls_list = svm_helper.get_low_shot_svm_classes(
+        targets, opts.dataset)
+
+    num_task = len(cls_list) * len(costs_list)
+    args_cls = []
+    args_cost = []
+    for cls in cls_list:
+        for cost in costs_list:
+            args_cls.append(cls)
+            args_cost.append(cost)
+    args_opts = [opts] * num_task
+    args_features = [features] * num_task
+    args_targets = [targets] * num_task
+
+    pool = mp.Pool(mp.cpu_count())
+    for _ in tqdm.tqdm(
+            pool.imap_unordered(
+                mp_helper,
+                zip(args_cls, args_cost, args_opts, args_features,
+                    args_targets)),
+            total=num_task):
+        pass
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Low-shot SVM model training')
+    parser.add_argument(
+        '--data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image features")
+    parser.add_argument(
+        '--targets_data_file',
+        type=str,
+        default=None,
+        help="Numpy file containing image labels")
+    parser.add_argument(
+        '--costs_list',
+        type=str,
+        default="0.01,0.1",
+        help="comma separated string containing list of costs")
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default=None,
+        help="path where to save the trained SVM models")
+    parser.add_argument(
+        '--random_seed',
+        type=int,
+        default=100,
+        help="random seed for SVM classifier training")
+    parser.add_argument(
+        '--dataset', type=str, default="voc", help='voc | places')
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    opts = parser.parse_args()
+    train_svm_low_shot(opts)
+
+
+if __name__ == '__main__':
+    main()
--- a/configs/base.py
+++ b/configs/base.py
@ -0,0 +1,18 @@
+train_cfg = {}  # empty at the base level
+test_cfg = {}  # empty at the base level
+optimizer_config = dict()  # grad_clip, coalesce, bucket_size_mb
+# yapf:disable
+log_config = dict(
+    interval=50,  # presumably counted in iterations -- TODO confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings (shared by the configs whose `_base_` points at this file)
+dist_params = dict(backend='nccl')
+cudnn_benchmark = True
+log_level = 'INFO'
+load_from = None  # no checkpoint path set here
+resume_from = None  # no checkpoint path set here
+workflow = [('train', 1)]
--- a/configs/classification/cifar10/r50.py
+++ b/configs/classification/cifar10/r50.py
@ -0,0 +1,59 @@
+_base_ = '../../base.py'  # i.e. configs/base.py
+# model settings: ResNet-50 backbone with a 10-class ClsHead
+model = dict(
+    type='Classification',
+    pretrained=None,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        out_indices=[4],  # 4: stage-4
+        norm_cfg=dict(type='BN')),
+    head=dict(
+        type='ClsHead', with_avg_pool=True, in_channels=2048, num_classes=10))
+# dataset settings
+data_source_cfg = dict(type='Cifar10', root='data/cifar/')
+dataset_type = 'ClassificationDataset'
+img_norm_cfg = dict(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.201])
+train_pipeline = [
+    dict(type='RandomCrop', size=32, padding=4),
+    dict(type='RandomHorizontalFlip'),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+test_pipeline = [
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=128,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(split='train', **data_source_cfg),
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_source=dict(split='test', **data_source_cfg),  # val uses 'test'
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        data_source=dict(split='test', **data_source_cfg),
+        pipeline=test_pipeline))
+# additional hooks
+custom_hooks = [
+    dict(
+        type='ValidateHook',
+        dataset=data['val'],  # reuses the val dataset dict defined above
+        initial=True,
+        interval=10,  # presumably in epochs -- TODO confirm
+        imgs_per_gpu=128,
+        workers_per_gpu=8,  # differs from data.workers_per_gpu (2)
+        eval_param=dict(topk=(1, 5)))
+]
+# optimizer
+optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005)
+# learning policy
+lr_config = dict(policy='step', step=[150, 250])  # presumably epoch indices
+checkpoint_config = dict(interval=50)
+# runtime settings
+total_epochs = 350
--- a/configs/classification/imagnet/r50.py
+++ b/configs/classification/imagnet/r50.py
@ -0,0 +1,68 @@
+_base_ = '../../base.py'  # i.e. configs/base.py
+# model settings: ResNet-50 backbone (SyncBN) with a 1000-class ClsHead
+model = dict(
+    type='Classification',
+    pretrained=None,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='SyncBN')),
+    head=dict(
+        type='ClsHead', with_avg_pool=True, in_channels=2048,
+        num_classes=1000))
+# dataset settings
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=False,
+    mclient_path='/mnt/lustre/share/memcached_client')  # site-specific path
+data_train_list = 'data/imagenet/meta/train_labeled.txt'
+data_train_root = 'data/imagenet/train'
+data_test_list = 'data/imagenet/meta/val_labeled.txt'
+data_test_root = 'data/imagenet/val'
+dataset_type = 'ClassificationDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomHorizontalFlip'),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+test_pipeline = [
+    dict(type='Resize', size=256),
+    dict(type='CenterCrop', size=224),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=32,  # total 256
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_test_list, root=data_test_root, **data_source_cfg),
+        pipeline=test_pipeline))
+# additional hooks
+custom_hooks = [
+    dict(
+        type='ValidateHook',
+        dataset=data['val'],  # reuses the val dataset dict defined above
+        initial=True,
+        interval=10,  # presumably in epochs -- TODO confirm
+        imgs_per_gpu=32,
+        workers_per_gpu=2,
+        eval_param=dict(topk=(1, 5)))
+]
+# optimizer
+optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001)
+# learning policy
+lr_config = dict(policy='step', step=[30, 60, 90])  # presumably epoch indices
+checkpoint_config = dict(interval=10)
+# runtime settings
+total_epochs = 90
--- a/configs/linear_classification/imagenet/r50_multihead.py
+++ b/configs/linear_classification/imagenet/r50_multihead.py
@ -0,0 +1,89 @@
+_base_ = '../../base.py'  # i.e. configs/base.py
+# model settings: ResNet-50 features from 5 levels, one MultiClsHead on top
+model = dict(
+    type='Classification',
+    pretrained=None,
+    frozen_backbone=True,  # presumably keeps the backbone fixed -- TODO confirm
+    with_sobel=False,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        in_channels=3,
+        out_indices=[0, 1, 2, 3, 4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='BN')),
+    head=dict(
+        type='MultiClsHead',
+        pool_type='specified',
+        in_indices=[0, 1, 2, 3, 4],  # same indices as backbone.out_indices
+        with_last_layer_unpool=True,
+        backbone='resnet50',
+        norm_cfg=dict(type='BN', momentum=0.1, affine=False),
+        num_classes=1000))
+# dataset settings
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=False,
+    mclient_path='/mnt/lustre/share/memcached_client')  # site-specific path
+data_train_list = 'data/imagenet/meta/train_labeled.txt'
+data_train_root = 'data/imagenet/train'
+data_test_list = 'data/imagenet/meta/val_labeled.txt'
+data_test_root = 'data/imagenet/val'
+dataset_type = 'ClassificationDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomHorizontalFlip'),
+    dict(
+        type='ColorJitter',
+        brightness=0.4,
+        contrast=0.4,
+        saturation=0.4,
+        hue=0.),
+    dict(type='ToTensor'),
+    dict(type='Lighting'),  # placed after ToTensor and before Normalize
+    dict(type='Normalize', **img_norm_cfg),
+]
+test_pipeline = [
+    dict(type='Resize', size=256),
+    dict(type='CenterCrop', size=224),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=256,  # total 256
+    workers_per_gpu=8,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_test_list, root=data_test_root, **data_source_cfg),
+        pipeline=test_pipeline))
+# additional hooks
+custom_hooks = [
+    dict(
+        type='ValidateHook',
+        dataset=data['val'],  # reuses the val dataset dict defined above
+        initial=True,
+        interval=10,  # presumably in epochs -- TODO confirm
+        imgs_per_gpu=128,
+        workers_per_gpu=4,
+        eval_param=dict(topk=(1, )))
+]
+# optimizer
+optimizer = dict(
+    type='SGD',
+    lr=0.01,
+    momentum=0.9,
+    weight_decay=0.0001,
+    paramwise_options=dict(norm_decay_mult=0.),
+    nesterov=True)
+# learning policy
+lr_config = dict(policy='step', step=[30, 60, 90])  # presumably epoch indices
+checkpoint_config = dict(interval=10)
+# runtime settings
+total_epochs = 90
--- a/configs/linear_classification/places205/r50_multihead.py
+++ b/configs/linear_classification/places205/r50_multihead.py
@ -0,0 +1,89 @@
+_base_ = '../../base.py'  # i.e. configs/base.py
+# model settings: ResNet-50 features from 5 levels, one MultiClsHead on top
+model = dict(
+    type='Classification',
+    pretrained=None,
+    frozen_backbone=True,  # presumably keeps the backbone fixed -- TODO confirm
+    with_sobel=False,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        in_channels=3,
+        out_indices=[0, 1, 2, 3, 4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='BN')),
+    head=dict(
+        type='MultiClsHead',
+        pool_type='specified',
+        in_indices=[0, 1, 2, 3, 4],  # same indices as backbone.out_indices
+        with_last_layer_unpool=True,
+        backbone='resnet50',
+        norm_cfg=dict(type='BN', momentum=0.1, affine=False),
+        num_classes=205))
+# dataset settings
+data_source_cfg = dict(
+    type='Places205',
+    memcached=False,
+    mclient_path='/mnt/lustre/share/memcached_client')  # site-specific path
+data_train_list = 'data/places205/meta/train_labeled.txt'
+data_train_root = 'data/places205/train'
+data_test_list = 'data/places205/meta/val_labeled.txt'
+data_test_root = 'data/places205/val'
+dataset_type = 'ClassificationDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomHorizontalFlip'),
+    dict(
+        type='ColorJitter',
+        brightness=0.4,
+        contrast=0.4,
+        saturation=0.4,
+        hue=0.),
+    dict(type='ToTensor'),
+    dict(type='Lighting'),  # placed after ToTensor and before Normalize
+    dict(type='Normalize', **img_norm_cfg),
+]
+test_pipeline = [
+    dict(type='Resize', size=256),
+    dict(type='CenterCrop', size=224),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=256,  # total 256
+    workers_per_gpu=8,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_test_list, root=data_test_root, **data_source_cfg),
+        pipeline=test_pipeline))
+# additional hooks
+custom_hooks = [
+    dict(
+        type='ValidateHook',
+        dataset=data['val'],  # reuses the val dataset dict defined above
+        initial=True,
+        interval=10,  # presumably in epochs -- TODO confirm
+        imgs_per_gpu=128,
+        workers_per_gpu=4,
+        eval_param=dict(topk=(1, )))
+]
+# optimizer
+optimizer = dict(
+    type='SGD',
+    lr=0.01,
+    momentum=0.9,
+    weight_decay=0.0001,
+    paramwise_options=dict(norm_decay_mult=0.),
+    nesterov=True)
+# learning policy
+lr_config = dict(policy='step', step=[30, 60, 90])  # presumably epoch indices
+checkpoint_config = dict(interval=10)
+# runtime settings
+total_epochs = 90
--- a/configs/selfsup/deepcluster/r50.py
+++ b/configs/selfsup/deepcluster/r50.py
@ -0,0 +1,88 @@
+_base_ = '../../base.py'
+# model settings
+num_classes = 10000
+model = dict(
+    type='DeepCluster',
+    pretrained=None,
+    with_sobel=True,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        in_channels=2,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='BN')),
+    neck=dict(type='AvgPoolNeck'),
+    head=dict(
+        type='ClsHead',
+        with_avg_pool=False,  # already has avgpool in the neck
+        in_channels=2048,
+        num_classes=num_classes))
+# dataset settings
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=True,
+    mclient_path='/mnt/lustre/share/memcached_client')
+data_train_list = 'data/imagenet/meta/train.txt'
+data_train_root = 'data/imagenet/train'
+dataset_type = 'DeepClusterDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomHorizontalFlip'),
+    dict(type='RandomRotation', degrees=2),
+    dict(
+        type='ColorJitter',
+        brightness=0.4,
+        contrast=0.4,
+        saturation=1.0,
+        hue=0.5),
+    dict(type='RandomGrayscale', p=0.2),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+extract_pipeline = [
+    dict(type='Resize', size=256),
+    dict(type='CenterCrop', size=224),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=64,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline))
+# additional hooks
+custom_hooks = [
+    dict(
+        type='DeepClusterHook',
+        extractor=dict(
+            imgs_per_gpu=128,
+            workers_per_gpu=8,
+            dataset=dict(
+                type=dataset_type,
+                data_source=dict(
+                    list_file=data_train_list,
+                    root=data_train_root,
+                    **data_source_cfg),
+                pipeline=extract_pipeline)),
+        clustering=dict(type='Kmeans', k=num_classes, pca_dim=256),
+        unif_sampling=True,
+        reweight=False,
+        reweight_pow=0.5,
+        initial=True,  # call initially
+        interval=1)
+]
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001,
+    nesterov=False,
+    paramwise_options={'\Ahead.': dict(momentum=0.)})
+# learning policy
+lr_config = dict(policy='step', step=[400])
+checkpoint_config = dict(interval=10)
+# runtime settings
+total_epochs = 480
--- a/configs/selfsup/moco/r50_v1.py
+++ b/configs/selfsup/moco/r50_v1.py
@ -0,0 +1,59 @@
+_base_ = '../../base.py'  # i.e. configs/base.py
+# model settings: MOCO with a ResNet-50 backbone and a linear neck
+model = dict(
+    type='MOCO',
+    pretrained=None,
+    queue_len=65536,
+    feat_dim=128,  # equals neck.out_channels
+    momentum=0.999,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        in_channels=3,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='BN')),
+    neck=dict(
+        type='LinearNeck',
+        in_channels=2048,
+        out_channels=128,
+        with_avg_pool=True),
+    head=dict(type='ContrastiveHead', temperature=0.07))
+# dataset settings
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=False,
+    mclient_path='/mnt/lustre/share/memcached_client')  # site-specific path
+data_train_list = 'data/imagenet/meta/train.txt'
+data_train_root = 'data/imagenet/train'
+dataset_type = 'ContrastiveDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224, scale=(0.2, 1.)),
+    dict(type='RandomGrayscale', p=0.2),
+    dict(
+        type='ColorJitter',
+        brightness=0.4,
+        contrast=0.4,
+        saturation=0.4,
+        hue=0.4),
+    dict(type='RandomHorizontalFlip'),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=32,  # total 32*8=256
+    workers_per_gpu=4,
+    drop_last=True,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=0.03, weight_decay=0.0001, momentum=0.9)
+# learning policy
+lr_config = dict(policy='step', step=[120, 160])  # presumably epoch indices
+checkpoint_config = dict(interval=10)
+# runtime settings
+total_epochs = 200
--- a/configs/selfsup/moco/r50_v2.py
+++ b/configs/selfsup/moco/r50_v2.py
@ -0,0 +1,75 @@
+_base_ = '../../base.py'  # i.e. configs/base.py
+# model settings: MOCO with a ResNet-50 backbone and a non-linear neck
+model = dict(
+    type='MOCO',
+    pretrained=None,
+    queue_len=65536,
+    feat_dim=128,  # equals neck.out_channels
+    momentum=0.999,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        in_channels=3,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='BN')),
+    neck=dict(
+        type='NonLinearNeckV1',
+        in_channels=2048,
+        hid_channels=2048,
+        out_channels=128,
+        with_avg_pool=True),
+    head=dict(type='ContrastiveHead', temperature=0.2))
+# dataset settings
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=False,
+    mclient_path='/mnt/lustre/share/memcached_client')  # site-specific path
+data_train_list = 'data/imagenet/meta/train.txt'
+data_train_root = 'data/imagenet/train'
+dataset_type = 'ContrastiveDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224, scale=(0.2, 1.)),
+    dict(
+        type='RandomAppliedTrans',
+        transforms=[
+            dict(
+                type='ColorJitter',
+                brightness=0.4,
+                contrast=0.4,
+                saturation=0.4,
+                hue=0.4)
+        ],
+        p=0.8),  # presumably the probability of applying the wrapped list
+    dict(type='RandomGrayscale', p=0.2),
+    dict(
+        type='RandomAppliedTrans',
+        transforms=[
+            dict(
+                type='GaussianBlur',
+                sigma_min=0.1,
+                sigma_max=2.0,
+                kernel_size=23)
+        ],
+        p=0.5),
+    dict(type='RandomHorizontalFlip'),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=32,  # total 32*8=256
+    workers_per_gpu=4,
+    drop_last=True,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=0.03, weight_decay=0.0001, momentum=0.9)
+# learning policy
+# NOTE(review): 'CosineAnealing' (single 'n') presumably has to match the LR
+# hook name of the installed runner library -- verify before "fixing" it.
+lr_config = dict(policy='CosineAnealing', min_lr=0.)
+checkpoint_config = dict(interval=10)
+# runtime settings
+total_epochs = 200
--- a/configs/selfsup/npid/r50.py
+++ b/configs/selfsup/npid/r50.py
@ -0,0 +1,64 @@
+_base_ = '../../base.py'  # i.e. configs/base.py
+# model settings: NPID with a ResNet-50 backbone and a feature memory bank
+model = dict(
+    type='NPID',
+    pretrained=None,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        in_channels=3,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='SyncBN')),
+    neck=dict(
+        type='LinearNeck',
+        in_channels=2048,
+        out_channels=128,
+        with_avg_pool=True),
+    head=dict(type='ContrastiveHead', temperature=0.07),
+    memory_bank=dict(  # feat_dim equals neck.out_channels
+        type='SimpleMemory', length=1281167, feat_dim=128, momentum=0.5))
+# dataset settings
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=False,
+    mclient_path='/mnt/lustre/share/memcached_client')  # site-specific path
+data_train_list = 'data/imagenet/meta/train.txt'
+data_train_root = 'data/imagenet/train'
+dataset_type = 'NPIDDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224, scale=(0.2, 1.)),
+    dict(type='RandomGrayscale', p=0.2),
+    dict(
+        type='ColorJitter',
+        brightness=0.4,
+        contrast=0.4,
+        saturation=0.4,
+        hue=0.4),
+    dict(type='RandomHorizontalFlip'),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+test_pipeline = [  # defined but not referenced elsewhere in this file
+    dict(type='Resize', size=256),
+    dict(type='CenterCrop', size=224),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=32,  # total 32*8
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.03, weight_decay=0.0001, momentum=0.9, nesterov=False)
+# learning policy
+lr_config = dict(policy='step', step=[120, 160])  # presumably epoch indices
+checkpoint_config = dict(interval=10)
+# runtime settings
+total_epochs = 200
--- a/configs/selfsup/rotation_pred/r50.py
+++ b/configs/selfsup/rotation_pred/r50.py
@ -0,0 +1,64 @@
+_base_ = '../../base.py'  # i.e. configs/base.py
+# model settings: RotationPred with a ResNet-50 backbone and a 4-class head
+model = dict(
+    type='RotationPred',
+    pretrained=None,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        in_channels=3,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='SyncBN')),
+    head=dict(
+        type='ClsHead', with_avg_pool=True, in_channels=2048, num_classes=4))
+# dataset settings
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=False,
+    mclient_path='/mnt/lustre/share/memcached_client')  # site-specific path
+data_train_list = 'data/imagenet/meta/train.txt'
+data_train_root = 'data/imagenet/train'
+data_test_list = 'data/imagenet/meta/val.txt'
+data_test_root = 'data/imagenet/val'
+dataset_type = 'RotationPredDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomHorizontalFlip'),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+test_pipeline = [
+    dict(type='Resize', size=256),
+    dict(type='CenterCrop', size=224),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=16,  # (16*4) x 8 = 512
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_test_list, root=data_test_root, **data_source_cfg),
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0001, nesterov=False)
+# learning policy: step schedule with a linear warmup
+lr_config = dict(
+    policy='step',
+    step=[30, 50],
+    warmup='linear',
+    warmup_iters=5,  # 5 ep
+    warmup_ratio=0.1,
+    warmup_by_epoch=True)  # warmup_iters above is given in epochs
+checkpoint_config = dict(interval=10)
+# runtime settings
+total_epochs = 70
--- a/configs/selfsup/simclr/r50_bs256.py
+++ b/configs/selfsup/simclr/r50_bs256.py
@ -0,0 +1,77 @@
+# SimCLR self-supervised pre-training config with a ResNet-50 backbone,
+# 32 images per GPU (32*8 = 256 in total, per the note on `imgs_per_gpu`).
+# NOTE(review): `_base_` presumably makes this file inherit the settings of
+# ../../base.py (mmcv-style config inheritance) -- confirm in the loader.
+_base_ = '../../base.py'
+# model settings
+model = dict(
+    type='SimCLR',
+    pretrained=None,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        in_channels=3,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='SyncBN')),
+    # the neck takes the 2048-channel backbone output and emits a
+    # 128-channel embedding (hidden width 2048)
+    neck=dict(
+        type='NonLinearNeckV1',
+        in_channels=2048,
+        hid_channels=2048,
+        out_channels=128,
+        with_avg_pool=True),
+    head=dict(type='ContrastiveHead', temperature=0.1))
+# dataset settings
+# NOTE(review): `mclient_path` is a cluster-specific path; presumably it is
+# only consulted when `memcached=True` -- verify against the data source.
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=True,
+    mclient_path='/mnt/lustre/share/memcached_client')
+data_train_list = 'data/imagenet/meta/train.txt'
+data_train_root = 'data/imagenet/train'
+dataset_type = 'ContrastiveDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+# augmentation pipeline: crop, flip, color jitter (p=0.8), grayscale (p=0.2),
+# Gaussian blur (p=0.5), then tensor conversion and normalization
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomHorizontalFlip'),
+    dict(
+        type='RandomAppliedTrans',
+        transforms=[
+            dict(
+                type='ColorJitter',
+                brightness=0.8,
+                contrast=0.8,
+                saturation=0.8,
+                hue=0.2)
+        ],
+        p=0.8),
+    dict(type='RandomGrayscale', p=0.2),
+    dict(
+        type='RandomAppliedTrans',
+        transforms=[
+            dict(
+                type='GaussianBlur',
+                sigma_min=0.1,
+                sigma_max=2.0,
+                kernel_size=23)
+        ],
+        p=0.5),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+# only a training split is configured; there is no `val` entry in this file
+data = dict(
+    imgs_per_gpu=32,  # total 32*8
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline))
+# optimizer
+optimizer = dict(type='LARS', lr=0.3, weight_decay=0.000001, momentum=0.9)
+# learning policy
+# NOTE(review): with `warmup_by_epoch=True`, `warmup_iters=10` is presumably
+# counted in epochs rather than iterations -- confirm in the LR hook.
+lr_config = dict(
+    policy='CosineAnealing',
+    min_lr=0.,
+    warmup='linear',
+    warmup_iters=10,
+    warmup_ratio=0.01,
+    warmup_by_epoch=True)
+# NOTE(review): interval is presumably in epochs -- confirm.
+checkpoint_config = dict(interval=10)
+# runtime settings
+total_epochs = 200
--- a/configs/selfsup/simclr/r50_bs512.py
+++ b/configs/selfsup/simclr/r50_bs512.py
@ -0,0 +1,77 @@
+# SimCLR self-supervised pre-training config with a ResNet-50 backbone,
+# 64 images per GPU (64*8 = 512 in total, per the note on `imgs_per_gpu`).
+# NOTE(review): `_base_` presumably makes this file inherit the settings of
+# ../../base.py (mmcv-style config inheritance) -- confirm in the loader.
+_base_ = '../../base.py'
+# model settings
+model = dict(
+    type='SimCLR',
+    pretrained=None,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        in_channels=3,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='SyncBN')),
+    # the neck takes the 2048-channel backbone output and emits a
+    # 128-channel embedding (hidden width 2048)
+    neck=dict(
+        type='NonLinearNeckV1',
+        in_channels=2048,
+        hid_channels=2048,
+        out_channels=128,
+        with_avg_pool=True),
+    head=dict(type='ContrastiveHead', temperature=0.1))
+# dataset settings
+# NOTE(review): `mclient_path` is a cluster-specific path; presumably it is
+# unused here because `memcached=False` -- verify against the data source.
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=False,
+    mclient_path='/mnt/lustre/share/memcached_client')
+data_train_list = 'data/imagenet/meta/train.txt'
+data_train_root = 'data/imagenet/train'
+dataset_type = 'ContrastiveDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+# augmentation pipeline: crop, flip, color jitter (p=0.8), grayscale (p=0.2),
+# Gaussian blur (p=0.5), then tensor conversion and normalization
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomHorizontalFlip'),
+    dict(
+        type='RandomAppliedTrans',
+        transforms=[
+            dict(
+                type='ColorJitter',
+                brightness=0.8,
+                contrast=0.8,
+                saturation=0.8,
+                hue=0.2)
+        ],
+        p=0.8),
+    dict(type='RandomGrayscale', p=0.2),
+    dict(
+        type='RandomAppliedTrans',
+        transforms=[
+            dict(
+                type='GaussianBlur',
+                sigma_min=0.1,
+                sigma_max=2.0,
+                kernel_size=23)
+        ],
+        p=0.5),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+# only a training split is configured; there is no `val` entry in this file
+data = dict(
+    imgs_per_gpu=64,  # total 64*8
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline))
+# optimizer
+optimizer = dict(type='LARS', lr=0.6, weight_decay=0.000001, momentum=0.9)
+# learning policy
+# NOTE(review): with `warmup_by_epoch=True`, `warmup_iters=10` is presumably
+# counted in epochs rather than iterations -- confirm in the LR hook.
+lr_config = dict(
+    policy='CosineAnealing',
+    min_lr=0.,
+    warmup='linear',
+    warmup_iters=10,
+    warmup_ratio=0.01,
+    warmup_by_epoch=True)
+# NOTE(review): interval is presumably in epochs -- confirm.
+checkpoint_config = dict(interval=10)
+# runtime settings
+total_epochs = 200
--- a/configs/semisup_classification/imagenet_10percent/r50.py
+++ b/configs/semisup_classification/imagenet_10percent/r50.py
@ -0,0 +1,69 @@
+_base_ = '../../base.py'
+# model settings
+model = dict(
+    type='Classification',
+    pretrained=None,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='SyncBN')),
+    head=dict(
+        type='ClsHead', with_avg_pool=True, in_channels=2048,
+        num_classes=1000))
+# dataset settings
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=True,
+    mclient_path='/mnt/lustre/share/memcached_client')
+data_train_list = 'data/imagenet/meta/train_labeled_10percent.txt'
+data_train_root = 'data/imagenet/train'
+data_test_list = 'data/imagenet/meta/val_labeled.txt'
+data_test_root = 'data/imagenet/val'
+dataset_type = 'ClassificationDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomHorizontalFlip'),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+test_pipeline = [
+    dict(type='Resize', size=256),
+    dict(type='CenterCrop', size=224),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=32,  # total 256
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_test_list, root=data_test_root, **data_source_cfg),
+        pipeline=test_pipeline))
+# additional hooks
+custom_hooks = [
+    dict(
+        type='ValidateHook',
+        dataset=data['val'],
+        initial=True,
+        interval=2,
+        imgs_per_gpu=32,
+        workers_per_gpu=2,
+        eval_param=dict(topk=(1, 5)))
+]
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005,
+                 paramwise_options={'\Ahead.': dict(lr_mult=10)})
+# learning policy
+lr_config = dict(policy='step', step=[18, 24], gamma=0.2)
+checkpoint_config = dict(interval=2)
+# runtime settings
+total_epochs = 30
--- a/configs/semisup_classification/imagenet_1percent/r50.py
+++ b/configs/semisup_classification/imagenet_1percent/r50.py
@ -0,0 +1,69 @@
+_base_ = '../../base.py'
+# model settings
+model = dict(
+    type='Classification',
+    pretrained=None,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        out_indices=[4],  # 0: conv-1, x: stage-x
+        norm_cfg=dict(type='SyncBN')),
+    head=dict(
+        type='ClsHead', with_avg_pool=True, in_channels=2048,
+        num_classes=1000))
+# dataset settings
+data_source_cfg = dict(
+    type='ImageNet',
+    memcached=True,
+    mclient_path='/mnt/lustre/share/memcached_client')
+data_train_list = 'data/imagenet/meta/train_labeled_1percent.txt'
+data_train_root = 'data/imagenet/train'
+data_test_list = 'data/imagenet/meta/val_labeled.txt'
+data_test_root = 'data/imagenet/val'
+dataset_type = 'ClassificationDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomHorizontalFlip'),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+test_pipeline = [
+    dict(type='Resize', size=256),
+    dict(type='CenterCrop', size=224),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg),
+]
+data = dict(
+    imgs_per_gpu=32,  # total 256
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_train_list, root=data_train_root,
+            **data_source_cfg),
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_source=dict(
+            list_file=data_test_list, root=data_test_root, **data_source_cfg),
+        pipeline=test_pipeline))
+# additional hooks
+custom_hooks = [
+    dict(
+        type='ValidateHook',
+        dataset=data['val'],
+        initial=True,
+        interval=2,
+        imgs_per_gpu=32,
+        workers_per_gpu=2,
+        eval_param=dict(topk=(1, 5)))
+]
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005,
+                 paramwise_options={'\Ahead.': dict(lr_mult=100)})
+# learning policy
+lr_config = dict(policy='step', step=[12, 16], gamma=0.2)
+checkpoint_config = dict(interval=2)
+# runtime settings
+total_epochs = 20
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -0,0 +1,2 @@
+## Changelog
+
--- a/docs/GETTING_STARTED.md
+++ b/docs/GETTING_STARTED.md
@ -0,0 +1,192 @@
+# Getting Started
+
+This page provides basic tutorials about the usage of OpenSelfSup.
+For installation instructions, please see [INSTALL.md](INSTALL.md).
+
+## Train existing methods
+
+**Note**: The default learning rate in config files is for 8 GPUs (except for those under `configs/linear_classification` that use 1 GPU). If you use a different number of GPUs, the total batch size changes in proportion, so you have to scale the learning rate following `new_lr = old_lr * new_ngpus / old_ngpus`. We recommend using `tools/dist_train.sh` even with 1 GPU, since some methods do not support non-distributed training.
+
+### Train with single/multiple GPUs
+```shell
+# checkpoints and logs are saved in the same sub-directory as the config file under `work_dirs/` by default.
+bash tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
+```
+
+An example:
+```shell
+bash tools/dist_train.sh configs/selfsup/odc/r50_v1.py 8
+```
+
+Optional arguments are:
+- `--work_dir ${WORK_DIR}`: Override the default working directory.
+- `--resume_from ${CHECKPOINT_FILE}`: Resume from a previous checkpoint file.
+- `--pretrained ${PRETRAIN_WEIGHTS}`: Load pretrained weights for the backbone.
+
+Alternatively, if you run OpenSelfSup on a cluster managed with [slurm](https://slurm.schedmd.com/):
+```shell
+SRUN_ARGS="${SRUN_ARGS}" bash tools/srun_train.sh ${PARTITION} ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
+```
+
+An example:
+```shell
+SRUN_ARGS="-w xx.xx.xx.xx" bash tools/srun_train.sh Dummy configs/selfsup/odc/r50_v1.py 8
+```
+
+### Launch multiple jobs on a single machine
+
+If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,
+you need to specify different ports (29500 by default) for each job to avoid communication conflict.
+
+If you use `dist_train.sh` to launch training jobs:
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 bash tools/dist_train.sh ${CONFIG_FILE} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 bash tools/dist_train.sh ${CONFIG_FILE} 4
+```
+
+If you launch training jobs with slurm:
+```shell
+GPUS_PER_NODE=4 bash tools/srun_train.sh ${PARTITION} ${CONFIG_FILE} 4 --port 29500
+GPUS_PER_NODE=4 bash tools/srun_train.sh ${PARTITION} ${CONFIG_FILE} 4 --port 29501
+```
+
+## Benchmarks
+
+We provide several standard benchmarks to evaluate representation learning.
+
+### VOC07 Linear SVM & Low-shot Linear SVM
+
+```shell
+bash benchmarks/dist_test_svm.sh ${CONFIG_FILE} ${EPOCH} ${FEAT_LIST} ${GPU_NUM}
+```
+Arguments:
+- `${FEAT_LIST}` is a string to specify features from layer1 to layer5 to evaluate; e.g., if you want to evaluate layer5 only, then `FEAT_LIST` is `feat5`; if you want to evaluate all features, then `FEAT_LIST` is `feat1 feat2 feat3 feat4 feat5` (separated by spaces).
+- `$GPU_NUM` is the number of GPUs to extract features.
+
+### ImageNet / Places205 Linear Classification
+
+```shell
+bash benchmarks/dist_test_cls.sh ${CONFIG_FILE} ${EPOCH} ${DATASET} [optional arguments]
+```
+Arguments:
+- `${DATASET}` in `['imagenet', 'places205']`.
+- Optional arguments include `--resume_from ${CHECKPOINT_FILE}`, which resumes from a previous checkpoint file.
+
+### VOC07+12 / COCO17 Object Detection
+
+1. First, extract backbone weights:
+
+    ```shell
+    python tools/extract_backbone_weights.py ${CHECKPOINT} --save-path ${WEIGHT_FILE}
+    ```
+    Arguments:
+    - `CHECKPOINT`: the checkpoint file of a selfsup method named as `epoch_*.pth`.
+    - `WEIGHT_FILE`: the output backbone weights file, e.g., `odc_v1.pth`.
+    
+2. Next, run detection. For more details on setting up the environment for detection, please see [here](benchmarks/detection/README.md).
+```shell
+conda activate detectron2
+cd benchmarks/detection
+python convert-pretrain-to-detectron2.py ${WEIGHT_FILE} ${OUTPUT_FILE} # must use .pkl as the output extension.
+bash run.sh ${DET_CFG} ${OUTPUT_FILE}
+```
+Arguments:
+- `DET_CFG`: the detectron2 config file, usually we use `configs/pascal_voc_R_50_C4_24k_moco.yaml`.
+- `OUTPUT_FILE`: converted backbone weights file, e.g., `odc_v1.pkl`.
+
+**Note**:
+- This benchmark must use 8 GPUs as the default setting from MoCo.
+- Please report the mean of 5 trials in your official paper, according to MoCo.
+- DeepCluster that uses Sobel layer is not supported by detectron2.
+
+### Publish a model
+
+1. Extract the backbone weights as mentioned before. You don't have to extract it again if you've already done it in the benchmark step.
+
+```shell
+python tools/extract_backbone_weights.py ${CHECKPOINT} --save-path ${WEIGHT_FILE}
+```
+
+2. Compute the hash of the weight file and append the hash id to the filename.
+
+```shell
+python tools/publish_model.py ${WEIGHT_FILE}
+```
+
+## How-to
+
+### Use a new dataset
+
+1. Write a data source file under `openselfsup/datasets/data_sources/`. You may refer to the existing ones.
+
+2. Create new config files for your experiments.
+
+### Design your own methods
+
+#### What you need to do
+
+    1. Create a dataset file under `openselfsup/datasets/` (better using existing ones);
+    2. Create a model file under `openselfsup/models/`. The model typically contains:
+      i) backbone (required): images to deep features from different depths of layers.
+      ii) neck (optional): deep features to compact feature vectors.
+      iii) head (optional): define loss functions.
+      iv) memory_bank (optional): define memory banks.
+    3. Create a config file under `configs/` and setup the configs;
+    4. Create a hook file under `openselfsup/hooks/` if your method requires additional operations before run, every several iterations, every several epoch, or after run.
+    
+You may refer to existing modules under respective folders.
+
+#### Features may facilitate your implementation
+
+* Decoupled data source and dataset.
+
+Since dataset is correlated to a specific task while data source is general, we decouple data source and dataset in OpenSelfSup.
+
+```python
+data = dict(
+    train=dict(type='ContrastiveDataset',
+               data_source=dict(type='ImageNet', list_file='xx', root='xx'),
+               pipeline=train_pipeline),
+    val=dict(...),
+)
+```
+
+* Configure data augmentations in the config file.
+
+The augmentations are the same as `torchvision.transforms`. `torchvision.transforms.RandomApply` corresponds to `RandomAppliedTrans`. `Lighting` and `GaussianBlur` are additionally implemented.
+
+```python
+train_pipeline = [
+    dict(type='RandomResizedCrop', size=224),
+    dict(type='RandomAppliedTrans',
+        transforms=[
+            dict(type='GaussianBlur', sigma_min=0.1, sigma_max=2.0, kernel_size=23)],
+        p=0.5),
+    dict(type='ToTensor'),
+    dict(type='Normalize', **img_norm_cfg)
+]
+```
+
+* Parameter-wise optimization parameters.
+
+You may specify optimization parameters including lr, momentum and weight_decay for a certain group of parameters in the config file with `paramwise_options`. `paramwise_options` is a dict whose keys are regular expressions and whose values are options. Options include 6 fields: lr, lr_mult, momentum, momentum_mult, weight_decay, weight_decay_mult.
+
+```python
+paramwise_options = {
+    '(bn|gn)(\d+)?.(weight|bias)': dict(weight_decay_mult=0.1),
+    '\Ahead.': dict(lr_mult=10, momentum=0)}
+optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9,
+                     weight_decay=0.0001,
+                     paramwise_options=paramwise_options)
+```
+
+* Configure custom hooks in the config file.
+
+The hooks will be called in order. For hook design, please refer to [odc_hook.py](openselfsup/hooks/odc_hook.py) as an example.
+
+```python
+custom_hooks = [
+    dict(type='DeepClusterHook', **kwargs1),
+    dict(type='ODCHook', **kwargs2),
+]
+```
--- a/docs/INSTALL.md
+++ b/docs/INSTALL.md
@ -0,0 +1,146 @@
+## Installation
+
+### Requirements
+
+- Linux (Windows is not officially supported)
+- Python 3.5+
+- PyTorch 1.1 or higher
+- CUDA 9.0 or higher
+- NCCL 2
+- GCC 4.9 or higher
+- [mmcv](https://github.com/open-mmlab/mmcv)
+
+We have tested the following versions of OS and software:
+
+- OS: Ubuntu 16.04/18.04 and CentOS 7.2
+- CUDA: 9.0/9.2/10.0/10.1
+- NCCL: 2.1.15/2.2.13/2.3.7/2.4.2
+- GCC(G++): 4.9/5.3/5.4/7.3
+
+### Install openselfsup
+
+a. Create a conda virtual environment and activate it.
+
+```shell
+conda create -n open-mmlab python=3.7 -y
+conda activate open-mmlab
+```
+
+b. Install PyTorch and torchvision following the [official instructions](https://pytorch.org/), e.g.,
+
+```shell
+conda install pytorch torchvision -c pytorch
+```
+
+c. Install other third-party libraries.
+
+```shell
+conda install faiss-gpu cudatoolkit=10.0 -c pytorch # optional for DeepCluster and ODC, assuming CUDA=10.0
+```
+
+d. Clone the openselfsup repository.
+
+```shell
+git clone https://github.com/open-mmlab/openselfsup.git
+cd openselfsup
+```
+
+e. Install.
+
+```shell
+pip install -v -e .  # or "python setup.py develop"
+```
+
+Note:
+
+1. The git commit id will be written to the version number with step e, e.g. 0.6.0+2e7045c. The version will also be saved in trained models.
+
+2. Following the above instructions, openselfsup is installed in `dev` mode, so any local modifications made to the code will take effect without the need to reinstall it (unless you submit some commits and want to update the version number).
+
+3. If you would like to use `opencv-python-headless` instead of `opencv-python`,
+you can install it before installing MMCV.
+
+4. Some dependencies are optional. Simply running `pip install -v -e .` will only install the minimum runtime requirements. To use optional dependencies like `albumentations` and `imagecorruptions` either install them manually with `pip install -r requirements/optional.txt` or specify desired extras when calling `pip` (e.g. `pip install -v -e .[optional]`). Valid keys for the extras field are: `all`, `tests`, `build`, and `optional`.
+
+
+### Prepare datasets
+
+It is recommended to symlink your dataset root (assuming $YOUR_DATA_ROOT) to `$OPENSELFSUP/data`.
+If your folder structure is different, you may need to change the corresponding paths in config files.
+
+#### Prepare PASCAL VOC
+
+Assuming that you usually store datasets in `$YOUR_DATA_ROOT` (e.g., for me, `/home/xhzhan/data/`).
+This script will automatically download PASCAL VOC 2007 into `$YOUR_DATA_ROOT`, prepare the required files, create a folder `data` under `$OPENSELFSUP` and make a symlink `VOCdevkit`.
+
+```shell
+cd $OPENSELFSUP
+bash tools/prepare_data/prepare_voc07_cls.sh $YOUR_DATA_ROOT
+```
+
+#### Prepare ImageNet and Places205
+
+Taking ImageNet for example, you need to 1) download ImageNet; 2) create list files under `$IMAGENET/meta/`, where `train.txt` contains an image file name in each line and `train_labeled.txt` contains `filename[space]label\n` in each line; 3) create a symlink under `$OPENSELFSUP/data/`.
+
+At last, the folder looks like:
+
+```
+OpenSelfSup
+├── openselfsup
+├── benchmarks
+├── configs
+├── data
+│   ├── VOCdevkit
+│   │   ├── VOC2007
+│   │   ├── VOC2012
+│   ├── imagenet
+│   │   ├── meta
+│   │   |   ├── train.txt ("filename\n" in each line)
+│   │   |   ├── train_labeled.txt ("filename[space]label\n" in each line)
+│   │   |   ├── val.txt
+│   │   |   ├── val_labeled.txt
+│   │   ├── train
+│   │   ├── val
+│   ├── places
+│   │   ├── meta
+│   │   |   ├── train.txt
+│   │   |   ├── train_labeled.txt
+│   │   |   ├── val.txt
+│   │   |   ├── val_labeled.txt
+│   │   ├── train
+│   │   ├── val
+```
+
+### A from-scratch setup script
+
+Here is a full script for setting up openselfsup with conda and linking the dataset paths.
+
+```shell
+conda create -n open-mmlab python=3.7 -y
+conda activate open-mmlab
+
+conda install -c pytorch pytorch torchvision -y
+git clone https://github.com/open-mmlab/OpenSelfSup.git
+cd OpenSelfSup
+pip install -v -e .
+
+bash tools/prepare_data/prepare_voc07_cls.sh $YOUR_DATA_ROOT
+ln -s $IMAGENET_ROOT data
+ln -s $PLACES_ROOT data
+```
+
+### Using multiple OpenSelfSup versions
+
+If there is more than one copy of openselfsup on your machine and you want to switch between them, the recommended way is to create multiple conda environments and use different environments for different versions.
+
+Another way is to insert the following code to the main scripts (`train.py`, `test.py` or any other scripts you run)
+```python
+import os.path as osp
+import sys
+sys.path.insert(0, osp.join(osp.dirname(osp.abspath(__file__)), '../'))
+```
+
+Or run the following command in the terminal of the corresponding folder to temporarily use the current one.
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
--- a/docs/MODEL_ZOO.md
+++ b/docs/MODEL_ZOO.md
@ -0,0 +1 @@
+# Model Zoo
--- a/docs/relation.jpg
+++ b/docs/relation.jpg
--- a/openselfsup/init.py
+++ b/openselfsup/init.py
@ -0,0 +1,3 @@
+from .version import __version__, short_version
+
+__all__ = ['__version__', 'short_version']
--- a/openselfsup/apis/init.py
+++ b/openselfsup/apis/init.py
@ -0,0 +1 @@
+from .train import get_root_logger, set_random_seed, train_model
--- a/openselfsup/apis/train.py
+++ b/openselfsup/apis/train.py
@ -0,0 +1,275 @@
+import random
+import re
+from collections import OrderedDict
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+from mmcv.runner import DistSamplerSeedHook, Runner, obj_from_dict
+
+from openselfsup.datasets import build_dataloader
+from openselfsup.hooks import build_hook, DistOptimizerHook
+from openselfsup.utils import get_root_logger, optimizers, print_log
+
+
+def set_random_seed(seed, deterministic=False):
+    """Set random seed.
+
+    Args:
+        seed (int): Seed to be used.
+        deterministic (bool): Whether to set the deterministic option for
+            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
+            to True and `torch.backends.cudnn.benchmark` to False.
+            Default: False.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    if deterministic:
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+
+def parse_losses(losses):
+    log_vars = OrderedDict()
+    for loss_name, loss_value in losses.items():
+        if isinstance(loss_value, torch.Tensor):
+            log_vars[loss_name] = loss_value.mean()
+        elif isinstance(loss_value, list):
+            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
+        else:
+            raise TypeError(
+                '{} is not a tensor or list of tensors'.format(loss_name))
+
+    loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)
+
+    log_vars['loss'] = loss
+    for loss_name, loss_value in log_vars.items():
+        # reduce loss when distributed training
+        if dist.is_available() and dist.is_initialized():
+            loss_value = loss_value.data.clone()
+            dist.all_reduce(loss_value.div_(dist.get_world_size()))
+        log_vars[loss_name] = loss_value.item()
+
+    return loss, log_vars
+
+
+def batch_processor(model, data, train_mode):
+    """Process a data batch.
+
+    This method is required as an argument of Runner, which defines how to
+    process a data batch and obtain proper outputs. The first 3 arguments of
+    batch_processor are fixed.
+
+    Args:
+        model (nn.Module): A PyTorch model.
+        data (dict): The data batch in a dict.
+        train_mode (bool): Training mode or not. It may be useless for some
+            models.
+
+    Returns:
+        dict: A dict containing losses and log vars.
+    """
+    assert model.training, "Must be in training mode."
+    losses = model(**data)
+    loss, log_vars = parse_losses(losses)
+
+    outputs = dict(
+        loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
+
+    return outputs
+
+
+def train_model(model,
+                dataset,
+                cfg,
+                distributed=False,
+                timestamp=None,
+                meta=None):
+    logger = get_root_logger(cfg.log_level)
+
+    # start training
+    if distributed:
+        _dist_train(
+            model, dataset, cfg, logger=logger, timestamp=timestamp, meta=meta)
+    else:
+        _non_dist_train(
+            model, dataset, cfg, logger=logger, timestamp=timestamp, meta=meta)
+
+
+def build_optimizer(model, optimizer_cfg):
+    """Build optimizer from configs.
+
+    Args:
+        model (:obj:`nn.Module`): The model with parameters to be optimized.
+        optimizer_cfg (dict): The config dict of the optimizer.
+            Positional fields are:
+                - type: class name of the optimizer.
+                - lr: base learning rate.
+            Optional fields are:
+                - any arguments of the corresponding optimizer type, e.g.,
+                  weight_decay, momentum, etc.
+                - paramwise_options: a dict with regular expression as keys
+                  to match parameter names and a dict containing options as
+                  values. Options include 6 fields: lr, lr_mult, momentum,
+                  momentum_mult, weight_decay, weight_decay_mult.
+
+    Returns:
+        torch.optim.Optimizer: The initialized optimizer.
+
+    Example:
+        >>> model = torch.nn.modules.Conv1d(1, 1, 1)
+        >>> paramwise_options = {
+        >>>     '(bn|gn)(\d+)?.(weight|bias)': dict(weight_decay_mult=0.1),
+        >>>     '\Ahead.': dict(lr_mult=10, momentum=0)}
+        >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9,
+        >>>                      weight_decay=0.0001,
+        >>>                      paramwise_options=paramwise_options)
+        >>> optimizer = build_optimizer(model, optimizer_cfg)
+    """
+    if hasattr(model, 'module'):
+        model = model.module
+
+    optimizer_cfg = optimizer_cfg.copy()
+    paramwise_options = optimizer_cfg.pop('paramwise_options', None)
+    # if no paramwise option is specified, just use the global setting
+    if paramwise_options is None:
+        return obj_from_dict(optimizer_cfg, optimizers,
+                             dict(params=model.parameters()))
+    else:
+        assert isinstance(paramwise_options, dict)
+        params = []
+        for name, param in model.named_parameters():
+            param_group = {'params': [param]}
+            if not param.requires_grad:
+                params.append(param_group)
+                continue
+
+            for regexp, options in paramwise_options.items():
+                if re.search(regexp, name):
+                    for key, value in options.items():
+                        if key.endswith('_mult'): # is a multiplier
+                            key = key[:-5]
+                            assert key in optimizer_cfg, \
+                                "{} not in optimizer_cfg".format(key)
+                            value = optimizer_cfg[key] * value
+                        param_group[key] = value
+                        if not dist.is_initialized() or dist.get_rank() == 0:
+                            print_log('paramwise_options -- {}: {}={}'.format(
+                                name, key, value))
+
+            # otherwise use the global settings
+            params.append(param_group)
+
+        optimizer_cls = getattr(optimizers, optimizer_cfg.pop('type'))
+        return optimizer_cls(params, **optimizer_cfg)
+
+
+def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
+    """Run training in distributed mode.
+
+    Builds one data loader per dataset, wraps the model in
+    ``MMDistributedDataParallel``, creates the optimizer and a ``Runner``,
+    registers the training hooks and any custom hooks, optionally restores a
+    checkpoint and finally starts the run.
+
+    Args:
+        model (nn.Module): The model to train; it is moved to the GPU here.
+        dataset: A dataset, or a list/tuple of datasets.
+        cfg: Config object. Read here: ``data``, ``seed``, ``optimizer``,
+            ``optimizer_config``, ``lr_config``, ``checkpoint_config``,
+            ``log_config``, ``work_dir``, ``resume_from``, ``load_from``,
+            ``workflow``, ``total_epochs`` and the optional ``custom_hooks``.
+        logger: Logger handed to the runner.
+        timestamp: Stored on the runner as ``runner.timestamp``.
+        meta: Passed through to the runner.
+    """
+    # prepare data loaders
+    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+    data_loaders = [
+        build_dataloader(
+            ds,
+            cfg.data.imgs_per_gpu,
+            cfg.data.workers_per_gpu,
+            dist=True,
+            shuffle=True,
+            replace=getattr(cfg.data, 'sampling_replace', False),
+            seed=cfg.seed,
+            drop_last=getattr(cfg.data, 'drop_last', False)) for ds in dataset
+    ]
+    # put model on gpus
+    model = MMDistributedDataParallel(
+        model.cuda(),
+        device_ids=[torch.cuda.current_device()],
+        broadcast_buffers=False)
+
+    # build runner
+    optimizer = build_optimizer(model, cfg.optimizer)
+    runner = Runner(
+        model,
+        batch_processor,
+        optimizer,
+        cfg.work_dir,
+        logger=logger,
+        meta=meta)
+    # an ugly workaround to make the .log and .log.json filenames the same
+    runner.timestamp = timestamp
+
+    # unlike the non-distributed path, the optimizer config is wrapped in a
+    # DistOptimizerHook before being registered
+    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
+
+    # register hooks
+    runner.register_training_hooks(cfg.lr_config, optimizer_config,
+                                   cfg.checkpoint_config, cfg.log_config)
+    runner.register_hook(DistSamplerSeedHook())
+    # register custom hooks
+    for hook in cfg.get('custom_hooks', ()):
+        # only DeepClusterHook additionally receives the data loaders
+        if hook.type == 'DeepClusterHook':
+            common_params = dict(dist_mode=True, data_loaders=data_loaders)
+        else:
+            common_params = dict(dist_mode=True)
+        runner.register_hook(build_hook(hook, common_params))
+
+    # `resume_from` takes precedence over `load_from`
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
+
+
+def _non_dist_train(model,
+                    dataset,
+                    cfg,
+                    validate=False,
+                    logger=None,
+                    timestamp=None,
+                    meta=None):
+    """Run training in non-distributed (``MMDataParallel``) mode.
+
+    Builds one data loader per dataset, wraps the model in
+    ``MMDataParallel`` over ``cfg.gpus`` devices, creates the optimizer and a
+    ``Runner``, registers the training hooks and any custom hooks, optionally
+    restores a checkpoint and finally starts the run.
+
+    Args:
+        model (nn.Module): The model to train.
+        dataset: A dataset, or a list/tuple of datasets.
+        cfg: Config object. Read here: ``data``, ``gpus``, ``seed``,
+            ``optimizer``, ``optimizer_config``, ``lr_config``,
+            ``checkpoint_config``, ``log_config``, ``work_dir``,
+            ``resume_from``, ``load_from``, ``workflow``, ``total_epochs``
+            and the optional ``custom_hooks``.
+        validate (bool): Not used in this function body.
+        logger: Logger handed to the runner.
+        timestamp: Stored on the runner as ``runner.timestamp``.
+        meta: Passed through to the runner.
+    """
+
+    # prepare data loaders
+    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+    data_loaders = [
+        build_dataloader(
+            ds,
+            cfg.data.imgs_per_gpu,
+            cfg.data.workers_per_gpu,
+            cfg.gpus,
+            dist=False,
+            shuffle=True,
+            replace=getattr(cfg.data, 'sampling_replace', False),
+            seed=cfg.seed,
+            drop_last=getattr(cfg.data, 'drop_last', False)) for ds in dataset
+    ]
+    # put model on gpus
+    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
+
+    # build runner
+    optimizer = build_optimizer(model, cfg.optimizer)
+    runner = Runner(
+        model,
+        batch_processor,
+        optimizer,
+        cfg.work_dir,
+        logger=logger,
+        meta=meta)
+    # an ugly workaround to make the .log and .log.json filenames the same
+    runner.timestamp = timestamp
+    # the raw optimizer config is registered as-is (no DistOptimizerHook)
+    optimizer_config = cfg.optimizer_config
+    runner.register_training_hooks(cfg.lr_config, optimizer_config,
+                                   cfg.checkpoint_config, cfg.log_config)
+
+    # register custom hooks
+    for hook in cfg.get('custom_hooks', ()):
+        # only DeepClusterHook additionally receives the data loaders
+        if hook.type == 'DeepClusterHook':
+            common_params = dict(dist_mode=False, data_loaders=data_loaders)
+        else:
+            common_params = dict(dist_mode=False)
+        runner.register_hook(build_hook(hook, common_params))
+
+    # `resume_from` takes precedence over `load_from`
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
--- a/openselfsup/datasets/init.py
+++ b/openselfsup/datasets/init.py
@ -0,0 +1,12 @@
+from .builder import build_dataset
+from .data_sources import *
+from .pipelines import *
+from .classification import ClassificationDataset
+from .deepcluster import DeepClusterDataset
+from .extraction import ExtractDataset
+from .npid import NPIDDataset
+from .rotation_pred import RotationPredDataset
+from .contrastive import ContrastiveDataset
+from .dataset_wrappers import ConcatDataset, RepeatDataset
+from .loader import DistributedGroupSampler, GroupSampler, build_dataloader
+from .registry import DATASETS
--- a/openselfsup/datasets/base.py
+++ b/openselfsup/datasets/base.py
@ -0,0 +1,32 @@
+from abc import ABCMeta, abstractmethod
+
+import torch
+from torch.utils.data import Dataset
+
+from openselfsup.utils import print_log, build_from_cfg
+
+from torchvision.transforms import Compose
+
+from .registry import DATASETS, PIPELINES
+from .builder import build_datasource
+
+
+class BaseDataset(Dataset, metaclass=ABCMeta):
+    """Base Dataset
+    """
+
+    def __init__(self, data_source, pipeline):
+        self.data_source = build_datasource(data_source)
+        pipeline = [build_from_cfg(p, PIPELINES) for p in pipeline]
+        self.pipeline = Compose(pipeline)
+
+    def __len__(self):
+        return self.data_source.get_length()
+
+    @abstractmethod
+    def __getitem__(self, idx):
+        pass
+
+    @abstractmethod
+    def evaluate(self, scores, keyword, logger=None, **kwargs):
+        pass
--- a/openselfsup/datasets/builder.py
+++ b/openselfsup/datasets/builder.py
@ -0,0 +1,43 @@
+import copy
+
+from openselfsup.utils import build_from_cfg
+from .dataset_wrappers import ConcatDataset, RepeatDataset
+from .registry import DATASETS, DATASOURCES
+
+
+def _concat_dataset(cfg, default_args=None):
+    ann_files = cfg['ann_file']
+    img_prefixes = cfg.get('img_prefix', None)
+    seg_prefixes = cfg.get('seg_prefix', None)
+    proposal_files = cfg.get('proposal_file', None)
+
+    datasets = []
+    num_dset = len(ann_files)
+    for i in range(num_dset):
+        data_cfg = copy.deepcopy(cfg)
+        data_cfg['ann_file'] = ann_files[i]
+        if isinstance(img_prefixes, (list, tuple)):
+            data_cfg['img_prefix'] = img_prefixes[i]
+        if isinstance(seg_prefixes, (list, tuple)):
+            data_cfg['seg_prefix'] = seg_prefixes[i]
+        if isinstance(proposal_files, (list, tuple)):
+            data_cfg['proposal_file'] = proposal_files[i]
+        datasets.append(build_dataset(data_cfg, default_args))
+
+    return ConcatDataset(datasets)
+
+
+def build_dataset(cfg, default_args=None):
+    if isinstance(cfg, (list, tuple)):
+        dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg])
+    elif cfg['type'] == 'RepeatDataset':
+        dataset = RepeatDataset(
+            build_dataset(cfg['dataset'], default_args), cfg['times'])
+    else:
+        dataset = build_from_cfg(cfg, DATASETS, default_args)
+
+    return dataset
+
+
+def build_datasource(cfg):
+    return build_from_cfg(cfg, DATASOURCES)
--- a/openselfsup/datasets/classification.py
+++ b/openselfsup/datasets/classification.py
@ -0,0 +1,43 @@
+import torch
+
+from openselfsup.utils import print_log
+
+from .registry import DATASETS
+from .base import BaseDataset
+
+
+@DATASETS.register_module
+class ClassificationDataset(BaseDataset):
+    """Dataset for classification
+    """
+
+    def __init__(self, data_source, pipeline):
+        super(ClassificationDataset, self).__init__(data_source, pipeline)
+
+    def __getitem__(self, idx):
+        img, target = self.data_source.get_sample(idx)
+        img = self.pipeline(img)
+        return dict(img=img, gt_label=target)
+
+    def evaluate(self, scores, keyword, logger=None, topk=(1, 5)):
+        '''results: Tensor (NxC)
+        '''
+        eval_res = {}
+
+        target = torch.LongTensor(self.data_source.labels)
+        assert scores.size(0) == target.size(0), \
+            "Inconsistent length for results and labels, {} vs {}".format(
+            scores.size(0), target.size(0))
+        num = scores.size(0)
+        _, pred = scores.topk(max(topk), dim=1, largest=True, sorted=True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))  # KxN
+        for k in topk:
+            correct_k = correct[:k].view(-1).float().sum(0).item()
+            acc = correct_k * 100.0 / num
+            eval_res["{}_acc@{}".format(keyword, k)] = acc
+            if logger is not None and logger != 'silent':
+                print_log(
+                    "{}_acc@{}: {:.03f}".format(keyword, k, acc),
+                    logger=logger)
+        return eval_res
--- a/openselfsup/datasets/contrastive.py
+++ b/openselfsup/datasets/contrastive.py
@ -0,0 +1,23 @@
+import torch
+
+from .registry import DATASETS
+from .base import BaseDataset
+
+
+@DATASETS.register_module
+class ContrastiveDataset(BaseDataset):
+    """Dataset for rotation prediction 
+    """
+
+    def __init__(self, data_source, pipeline):
+        super(ContrastiveDataset, self).__init__(data_source, pipeline)
+
+    def __getitem__(self, idx):
+        img, _ = self.data_source.get_sample(idx)
+        img1 = self.pipeline(img)
+        img2 = self.pipeline(img)
+        img_cat = torch.cat((img1.unsqueeze(0), img2.unsqueeze(0)), dim=0)
+        return dict(img=img_cat)
+
+    def evaluate(self, scores, keyword, logger=None):
+        raise NotImplemented
--- a/openselfsup/datasets/data_sources/init.py
+++ b/openselfsup/datasets/data_sources/init.py
@ -0,0 +1,3 @@
+from .cifar import Cifar10, Cifar100
+from .image_list import ImageList
+from .imagenet import ImageNet
--- a/openselfsup/datasets/data_sources/cifar.py
+++ b/openselfsup/datasets/data_sources/cifar.py
@ -0,0 +1,55 @@
+from PIL import Image
+
+from torchvision.datasets import CIFAR10, CIFAR100
+
+from ..registry import DATASOURCES
+
+
+@DATASOURCES.register_module
+class Cifar10(object):
+
+    CLASSES = [
+        'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog',
+        'horse', 'ship', 'truck'
+    ]
+
+    def __init__(self, root, split):
+        assert split in ['train', 'test']
+        try:
+            self.cifar = CIFAR10(
+                root=root, train=split == 'train', download=False)
+        except:
+            raise Exception("Please download CIFAR10 manually, \
+                  in case of downloading the dataset parallelly \
+                  that may corrupt the dataset.")
+        self.labels = self.cifar.targets
+
+    def get_length(self):
+        return len(self.cifar)
+
+    def get_sample(self, idx):
+        img = Image.fromarray(self.cifar.data[idx])
+        target = self.labels[idx]  # img: HWC, RGB
+        return img, target
+
+
+@DATASOURCES.register_module
+class Cifar100(object):
+
+    CLASSES = None
+
+    def __init__(self, root, split):
+        assert split in ['train', 'test']
+        try:
+            self.cifar = CIFAR100(
+                root=root, train=spilt == 'train', download=False)
+        except:
+            raise Exception("Please download CIFAR10 manually, \
+                  in case of downloading the dataset parallelly \
+                  that may corrupt the dataset.")
+        self.labels = self.cifar.targets
+
+    def get_sample(self, idx):
+        img = Image.fromarray(self.cifar.data[idx])
+        target = self.labels[idx]  # img: HWC, RGB
+        return img, target
--- a/openselfsup/datasets/data_sources/image_list.py
+++ b/openselfsup/datasets/data_sources/image_list.py
@ -0,0 +1,36 @@
+import os
+from PIL import Image
+
+from ..registry import DATASOURCES
+from .utils import McLoader
+
+
+@DATASOURCES.register_module
+class ImageList(object):
+
+    def __init__(self, root, list_file, memcached, mclient_path):
+        with open(list_file, 'r') as f:
+            lines = f.readlines()
+        self.fns = [os.path.join(root, l.strip()) for l in lines]
+        self.memcached = memcached
+        self.mclient_path = mclient_path
+        self.initialized = False
+
+    def _init_memcached(self):
+        if not self.initialized:
+            assert self.mclient_path is not None
+            self.mc_loader = McLoader(self.mclient_path)
+            self.initialized = True
+
+    def get_length(self):
+        return len(self.fns)
+
+    def get_sample(self, idx):
+        if self.memcached:
+            self._init_memcached()
+        if self.memcached:
+            img = self.mc_loader(self.fns[idx])
+        else:
+            img = Image.open(self.fns[idx])
+        img = img.convert('RGB')
+        return img
--- a/openselfsup/datasets/data_sources/imagenet.py
+++ b/openselfsup/datasets/data_sources/imagenet.py
@ -0,0 +1,43 @@
+import os
+from PIL import Image
+
+from ..registry import DATASOURCES
+from .utils import McLoader
+
+
+@DATASOURCES.register_module
+class ImageNet(object):
+
+    def __init__(self, root, list_file, memcached, mclient_path):
+        with open(list_file, 'r') as f:
+            lines = f.readlines()
+        self.has_labels = len(lines[0].split()) == 2
+        if self.has_labels:
+            self.fns, self.labels = zip(*[l.strip().split() for l in lines])
+            self.labels = [int(l) for l in self.labels]
+        else:
+            self.fns = [l.strip() for l in lines]
+        self.fns = [os.path.join(root, fn) for fn in self.fns]
+        self.memcached = memcached
+        self.mclient_path = mclient_path
+        self.initialized = False
+
+    def _init_memcached(self):
+        if not self.initialized:
+            assert self.mclient_path is not None
+            self.mc_loader = McLoader(self.mclient_path)
+            self.initialized = True
+
+    def get_length(self):
+        return len(self.fns)
+
+    def get_sample(self, idx):
+        if self.memcached:
+            self._init_memcached()
+        if self.memcached:
+            img = self.mc_loader(self.fns[idx])
+        else:
+            img = Image.open(self.fns[idx])
+        img = img.convert('RGB')
+        target = self.labels[idx] if self.has_labels else None
+        return img, target
--- a/openselfsup/datasets/data_sources/utils.py
+++ b/openselfsup/datasets/data_sources/utils.py
@ -0,0 +1,36 @@
+import io
+from PIL import Image
+try:
+    import mc
+except ImportError as E:
+    pass
+
+
+def pil_loader(img_str):
+    buff = io.BytesIO(img_str)
+    return Image.open(buff)
+
+
+class McLoader(object):
+
+    def __init__(self, mclient_path):
+        assert mclient_path is not None, \
+            "Please specify 'data_mclient_path' in the config."
+        self.mclient_path = mclient_path
+        server_list_config_file = "{}/server_list.conf".format(
+            self.mclient_path)
+        client_config_file = "{}/client.conf".format(self.mclient_path)
+        self.mclient = mc.MemcachedClient.GetInstance(server_list_config_file,
+                                                      client_config_file)
+
+    def __call__(self, fn):
+        try:
+            img_value = mc.pyvector()
+            self.mclient.Get(fn, img_value)
+            img_value_str = mc.ConvertBuffer(img_value)
+            img = pil_loader(img_value_str)
+        except:
+            print('Read image failed ({})'.format(fn))
+            return None
+        else:
+            return img
--- a/openselfsup/datasets/dataset_wrappers.py
+++ b/openselfsup/datasets/dataset_wrappers.py
@ -0,0 +1,55 @@
+import numpy as np
+from torch.utils.data.dataset import ConcatDataset as _ConcatDataset
+
+from .registry import DATASETS
+
+
+@DATASETS.register_module
+class ConcatDataset(_ConcatDataset):
+    """A wrapper of concatenated dataset.
+
+    Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but
+    concat the group flag for image aspect ratio.
+
+    Args:
+        datasets (list[:obj:`Dataset`]): A list of datasets.
+    """
+
+    def __init__(self, datasets):
+        super(ConcatDataset, self).__init__(datasets)
+        self.CLASSES = datasets[0].CLASSES
+        if hasattr(datasets[0], 'flag'):
+            flags = []
+            for i in range(0, len(datasets)):
+                flags.append(datasets[i].flag)
+            self.flag = np.concatenate(flags)
+
+
+@DATASETS.register_module
+class RepeatDataset(object):
+    """A wrapper of repeated dataset.
+
+    The length of repeated dataset will be `times` larger than the original
+    dataset. This is useful when the data loading time is long but the dataset
+    is small. Using RepeatDataset can reduce the data loading time between
+    epochs.
+
+    Args:
+        dataset (:obj:`Dataset`): The dataset to be repeated.
+        times (int): Repeat times.
+    """
+
+    def __init__(self, dataset, times):
+        self.dataset = dataset
+        self.times = times
+        self.CLASSES = dataset.CLASSES
+        if hasattr(self.dataset, 'flag'):
+            self.flag = np.tile(self.dataset.flag, times)
+
+        self._ori_len = len(self.dataset)
+
+    def __getitem__(self, idx):
+        return self.dataset[idx % self._ori_len]
+
+    def __len__(self):
+        return self.times * self._ori_len
--- a/openselfsup/datasets/deepcluster.py
+++ b/openselfsup/datasets/deepcluster.py
@ -0,0 +1,29 @@
+from .registry import DATASETS
+from .base import BaseDataset
+
+
+@DATASETS.register_module
+class DeepClusterDataset(BaseDataset):
+    """Dataset for DC and ODC.
+    """
+
+    def __init__(self, data_source, pipeline):
+        super(DeepClusterDataset, self).__init__(data_source, pipeline)
+        # init clustering labels
+        self.labels = [-1 for _ in range(self.data_source.get_length())]
+
+    def __getitem__(self, idx):
+        img, _ = self.data_source.get_sample(idx)
+        label = self.labels[idx]
+        img = self.pipeline(img)
+        return dict(img=img, pseudo_label=label, idx=idx)
+
+    def assign_labels(self, labels):
+        assert len(self.labels) == len(labels), \
+            "Inconsistent lenght of asigned labels, \
+            {} vs {}".format(len(self.labels), len(labels))
+        self.labels = labels[:]
+
+    def evaluate(self, scores, keyword, logger=None):
+
+        raise NotImplemented
--- a/openselfsup/datasets/extraction.py
+++ b/openselfsup/datasets/extraction.py
@ -0,0 +1,19 @@
+from .registry import DATASETS
+from .base import BaseDataset
+
+
+@DATASETS.register_module
+class ExtractDataset(BaseDataset):
+    """Dataset for feature extraction
+    """
+
+    def __init__(self, data_source, pipeline):
+        super(ExtractDataset, self).__init__(data_source, pipeline)
+
+    def __getitem__(self, idx):
+        img = self.data_source.get_sample(idx)
+        img = self.pipeline(img)
+        return dict(img=img)
+
+    def evaluate(self, scores, keyword, logger=None):
+        raise NotImplemented
--- a/openselfsup/datasets/loader/init.py
+++ b/openselfsup/datasets/loader/init.py
@ -0,0 +1,7 @@
+from .build_loader import build_dataloader
+from .sampler import DistributedGroupSampler, GroupSampler, DistributedGivenIterationSampler
+
+__all__ = [
+    'GroupSampler', 'DistributedGroupSampler', 'build_dataloader',
+    'DistributedGivenIterationSampler'
+]
--- a/openselfsup/datasets/loader/build_loader.py
+++ b/openselfsup/datasets/loader/build_loader.py
@ -0,0 +1,81 @@
+import platform
+import random
+from functools import partial
+
+import numpy as np
+from mmcv.parallel import collate
+from mmcv.runner import get_dist_info
+from torch.utils.data import DataLoader
+
+#from .sampler import DistributedGroupSampler, DistributedSampler, GroupSampler
+from .sampler import DistributedSampler, DistributedGivenIterationSampler
+from torch.utils.data import RandomSampler
+
+if platform.system() != 'Windows':
+    # https://github.com/pytorch/pytorch/issues/973
+    import resource
+    rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
+    resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
+
+
+def build_dataloader(dataset,
+                     imgs_per_gpu,
+                     workers_per_gpu,
+                     num_gpus=1,
+                     dist=True,
+                     shuffle=True,
+                     replace=False,
+                     seed=None,
+                     **kwargs):
+    """Build PyTorch DataLoader.
+
+    In distributed training, each GPU/process has a dataloader.
+    In non-distributed training, there is only one dataloader for all GPUs.
+
+    Args:
+        dataset (Dataset): A PyTorch dataset.
+        imgs_per_gpu (int): Number of images on each GPU, i.e., batch size of
+            each GPU.
+        workers_per_gpu (int): How many subprocesses to use for data loading
+            for each GPU.
+        num_gpus (int): Number of GPUs. Only used in non-distributed training.
+        dist (bool): Distributed training/test or not. Default: True.
+        shuffle (bool): Whether to shuffle the data at every epoch.
+            Default: True.
+        replace (bool): Replace or not in random shuffle.
+            It works on when shuffle is True.
+        kwargs: any keyword argument to be used to initialize DataLoader
+
+    Returns:
+        DataLoader: A PyTorch dataloader.
+    """
+    if dist:
+        rank, world_size = get_dist_info()
+        sampler = DistributedSampler(
+            dataset, world_size, rank, shuffle=shuffle, replace=replace)
+        batch_size = imgs_per_gpu
+        num_workers = workers_per_gpu
+    else:
+        if replace:
+            raise NotImplemented
+        sampler = RandomSampler(
+            dataset) if shuffle else None  # TODO: set replace
+        batch_size = num_gpus * imgs_per_gpu
+        num_workers = num_gpus * workers_per_gpu
+
+    data_loader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        sampler=sampler,
+        num_workers=num_workers,
+        collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
+        pin_memory=False,
+        worker_init_fn=worker_init_fn if seed is not None else None,
+        **kwargs)
+
+    return data_loader
+
+
+def worker_init_fn(seed):
+    np.random.seed(seed)
+    random.seed(seed)
--- a/openselfsup/datasets/loader/sampler.py
+++ b/openselfsup/datasets/loader/sampler.py
@ -0,0 +1,299 @@
+from __future__ import division
+import math
+
+import numpy as np
+import torch
+from mmcv.runner import get_dist_info
+from torch.utils.data import DistributedSampler as _DistributedSampler
+from torch.utils.data import Sampler
+
+
+class DistributedSampler(_DistributedSampler):
+
+    def __init__(self,
+                 dataset,
+                 num_replicas=None,
+                 rank=None,
+                 shuffle=True,
+                 replace=False):
+        super().__init__(dataset, num_replicas=num_replicas, rank=rank)
+        self.shuffle = shuffle
+        self.replace = replace
+        self.unif_sampling_flag = False
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        if not self.unif_sampling_flag:
+            self.generate_new_list()
+        else:
+            self.unif_sampling_flag = False
+        return iter(self.indices[self.rank * self.num_samples:(self.rank + 1) *
+                                 self.num_samples])
+
+    def generate_new_list(self):
+        if self.shuffle:
+            g = torch.Generator()
+            g.manual_seed(self.epoch)
+            if self.replace:
+                indices = torch.randint(
+                    low=0,
+                    high=len(self.dataset),
+                    size=(len(self.dataset), ),
+                    generator=g).tolist()
+            else:
+                indices = torch.randperm(
+                    len(self.dataset), generator=g).tolist()
+        else:
+            indices = torch.arange(len(self.dataset)).tolist()
+
+        # add extra samples to make it evenly divisible
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+        self.indices = indices
+
+    def set_uniform_indices(self, labels, num_classes):
+        self.unif_sampling_flag = True
+        assert self.shuffle, "Using uniform sampling, the indices must be shuffled."
+        np.random.seed(self.epoch)
+        assert (len(labels) == len(self.dataset))
+        N = len(labels)
+        size_per_label = int(N / num_classes) + 1
+        indices = []
+        images_lists = [[] for i in range(num_classes)]
+        for i, l in enumerate(labels):
+            images_lists[l].append(i)
+        for i, l in enumerate(images_lists):
+            if len(l) == 0:
+                continue
+            indices.extend(
+                np.random.choice(
+                    l, size_per_label, replace=(len(l) <= size_per_label)))
+        indices = np.array(indices)
+        np.random.shuffle(indices)
+        indices = indices[:N].astype(np.int).tolist()
+
+        # add extra samples to make it evenly divisible
+        assert len(indices) <= self.total_size, \
+            "{} vs {}".format(len(indices), self.total_size)
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size, \
+            "{} vs {}".format(len(indices), self.total_size)
+        self.indices = indices
+
+
+class GroupSampler(Sampler):
+    """Sampler yielding indices in chunks that share one ``flag`` value.
+
+    Each group (samples with equal ``dataset.flag``) is padded to a multiple
+    of ``samples_per_gpu``; the resulting chunks of ``samples_per_gpu``
+    indices are then emitted in random order.
+
+    Args:
+        dataset: Dataset exposing a ``flag`` array.
+        samples_per_gpu (int): Chunk size.
+    """
+
+    def __init__(self, dataset, samples_per_gpu=1):
+        assert hasattr(dataset, 'flag')
+        self.dataset = dataset
+        self.samples_per_gpu = samples_per_gpu
+        self.flag = dataset.flag.astype(np.int64)
+        self.group_sizes = np.bincount(self.flag)
+        # total length after rounding every group up to a multiple of
+        # samples_per_gpu
+        self.num_samples = 0
+        for i, size in enumerate(self.group_sizes):
+            self.num_samples += int(np.ceil(
+                size / self.samples_per_gpu)) * self.samples_per_gpu
+
+    def __iter__(self):
+        indices = []
+        for i, size in enumerate(self.group_sizes):
+            if size == 0:
+                continue
+            indice = np.where(self.flag == i)[0]
+            assert len(indice) == size
+            np.random.shuffle(indice)
+            # pad the group with randomly re-drawn members
+            num_extra = int(np.ceil(size / self.samples_per_gpu)
+                            ) * self.samples_per_gpu - len(indice)
+            indice = np.concatenate(
+                [indice, np.random.choice(indice, num_extra)])
+            indices.append(indice)
+        indices = np.concatenate(indices)
+        # shuffle at chunk granularity so each chunk stays within one group
+        indices = [
+            indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu]
+            for i in np.random.permutation(
+                range(len(indices) // self.samples_per_gpu))
+        ]
+        indices = np.concatenate(indices)
+        indices = indices.astype(np.int64).tolist()
+        assert len(indices) == self.num_samples
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+
+class DistributedGroupSampler(Sampler):
+    """Sampler that restricts data loading to a subset of the dataset.
+    It is especially useful in conjunction with
+    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
+    process can pass a DistributedSampler instance as a DataLoader sampler,
+    and load a subset of the original dataset that is exclusive to it.
+    .. note::
+        Dataset is assumed to be of constant size.
+    Arguments:
+        dataset: Dataset used for sampling; must expose a ``flag`` array.
+        samples_per_gpu (optional): Size of the chunks that are kept within
+            one group.
+        num_replicas (optional): Number of processes participating in
+            distributed training.
+        rank (optional): Rank of the current process within num_replicas.
+    """
+
+    def __init__(self,
+                 dataset,
+                 samples_per_gpu=1,
+                 num_replicas=None,
+                 rank=None):
+        # fall back to the values reported by get_dist_info()
+        _rank, _num_replicas = get_dist_info()
+        if num_replicas is None:
+            num_replicas = _num_replicas
+        if rank is None:
+            rank = _rank
+        self.dataset = dataset
+        self.samples_per_gpu = samples_per_gpu
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+
+        assert hasattr(self.dataset, 'flag')
+        self.flag = self.dataset.flag
+        self.group_sizes = np.bincount(self.flag)
+
+        # per-replica sample count: every group is rounded up to a multiple
+        # of samples_per_gpu * num_replicas and shared between the replicas
+        self.num_samples = 0
+        for i, j in enumerate(self.group_sizes):
+            self.num_samples += int(
+                math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
+                          self.num_replicas)) * self.samples_per_gpu
+        self.total_size = self.num_samples * self.num_replicas
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+
+        indices = []
+        for i, size in enumerate(self.group_sizes):
+            if size > 0:
+                indice = np.where(self.flag == i)[0]
+                assert len(indice) == size
+                indice = indice[list(torch.randperm(int(size),
+                                                    generator=g))].tolist()
+                extra = int(
+                    math.ceil(
+                        size * 1.0 / self.samples_per_gpu / self.num_replicas)
+                ) * self.samples_per_gpu * self.num_replicas - len(indice)
+                # pad indice by repeating the shuffled group
+                tmp = indice.copy()
+                for _ in range(extra // size):
+                    indice.extend(tmp)
+                indice.extend(tmp[:extra % size])
+                indices.extend(indice)
+
+        assert len(indices) == self.total_size
+
+        # shuffle at chunk granularity (chunks of samples_per_gpu indices)
+        indices = [
+            indices[j] for i in list(
+                torch.randperm(
+                    len(indices) // self.samples_per_gpu, generator=g))
+            for j in range(i * self.samples_per_gpu, (i + 1) *
+                           self.samples_per_gpu)
+        ]
+
+        # subsample: keep the contiguous slice belonging to this rank
+        offset = self.num_samples * self.rank
+        indices = indices[offset:offset + self.num_samples]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        # the epoch seeds the generator used in __iter__
+        self.epoch = epoch
+
+
+class DistributedGivenIterationSampler(Sampler):
+
+    def __init__(self,
+                 dataset,
+                 total_iter,
+                 batch_size,
+                 num_replicas=None,
+                 rank=None,
+                 last_iter=-1):
+        rank, world_size = get_dist_info()
+        assert rank < world_size
+        self.dataset = dataset
+        self.total_iter = total_iter
+        self.batch_size = batch_size
+        self.world_size = world_size
+        self.rank = rank
+        self.last_iter = last_iter
+
+        self.total_size = self.total_iter * self.batch_size
+
+        self.indices = self.gen_new_list()
+
+    def __iter__(self):
+        return iter(self.indices[(self.last_iter + 1) * self.batch_size:])
+
+    def set_uniform_indices(self, labels, num_classes):
+        np.random.seed(0)
+        assert (len(labels) == len(self.dataset))
+        N = len(labels)
+        size_per_label = int(N / num_classes) + 1
+        indices = []
+        images_lists = [[] for i in range(num_classes)]
+        for i, l in enumerate(labels):
+            images_lists[l].append(i)
+        for i, l in enumerate(images_lists):
+            if len(l) == 0:
+                continue
+            indices.extend(
+                np.random.choice(
+                    l, size_per_label, replace=(len(l) <= size_per_label)))
+        indices = np.array(indices)
+        np.random.shuffle(indices)
+        indices = indices[:N].astype(np.int)
+        # repeat
+        all_size = self.total_size * self.world_size
+        indices = indices[:all_size]
+        num_repeat = (all_size - 1) // indices.shape[0] + 1
+        indices = np.tile(indices, num_repeat)
+        indices = indices[:all_size]
+        np.random.shuffle(indices)
+        # slice
+        beg = self.total_size * self.rank
+        indices = indices[beg:beg + self.total_size]
+        assert len(indices) == self.total_size
+        # set
+        self.indices = indices
+
+    def gen_new_list(self):
+
+        # each process shuffle all list with same seed, and pick one piece according to rank
+        np.random.seed(0)
+
+        all_size = self.total_size * self.world_size
+        indices = np.arange(len(self.dataset))
+        indices = indices[:all_size]
+        num_repeat = (all_size - 1) // indices.shape[0] + 1
+        indices = np.tile(indices, num_repeat)
+        indices = indices[:all_size]
+
+        np.random.shuffle(indices)
+        beg = self.total_size * self.rank
+        indices = indices[beg:beg + self.total_size]
+
+        assert len(indices) == self.total_size
+
+        return indices
+
+    def __len__(self):
+        # note here we do not take last iter into consideration, since __len__
+        # should only be used for displaying, the correct remaining size is
+        # handled by dataloader
+        #return self.total_size - (self.last_iter+1)*self.batch_size
+        return self.total_size
+
+    def set_epoch(self, epoch):
+        pass
--- a/openselfsup/datasets/npid.py
+++ b/openselfsup/datasets/npid.py
@ -0,0 +1,20 @@
+from .registry import DATASETS
+from .base import BaseDataset
+
+
+@DATASETS.register_module
+class NPIDDataset(BaseDataset):
+    """Dataset for NPID.
+    """
+
+    def __init__(self, data_source, pipeline):
+        super(NPIDDataset, self).__init__(data_source, pipeline)
+
+    def __getitem__(self, idx):
+        img, _ = self.data_source.get_sample(idx)
+        img = self.pipeline(img)
+        return dict(img=img, idx=idx)
+
+    def evaluate(self, scores, keyword, logger=None):
+
+        raise NotImplemented
--- a/openselfsup/datasets/pipelines/init.py
+++ b/openselfsup/datasets/pipelines/init.py
@ -0,0 +1 @@
+from .transforms import *
--- a/openselfsup/datasets/pipelines/transforms.py
+++ b/openselfsup/datasets/pipelines/transforms.py
@ -0,0 +1,92 @@
+import cv2
+import inspect
+import numpy as np
+from PIL import Image
+
+import torch
+from torchvision import transforms as _transforms
+
+from openselfsup.utils import build_from_cfg
+
+from ..registry import PIPELINES
+
+# register all existing transforms in torchvision: every class found in
+# torchvision.transforms becomes available to pipeline configs by name
+# (inspect.getmembers yields (name, class) pairs, hence m[1])
+for m in inspect.getmembers(_transforms, inspect.isclass):
+    PIPELINES.register_module(m[1])
+
+
+@PIPELINES.register_module
+class RandomAppliedTrans(object):
+    '''Randomly applied transformations.
+    Args:
+        transforms (List[Dict]): List of transformations in dictionaries.
+    '''
+
+    def __init__(self, transforms, p=0.5):
+        t = [build_from_cfg(t, PIPELINES) for t in transforms]
+        self.trans = _transforms.RandomApply(t, p=p)
+
+    def __call__(self, img):
+        return self.trans(img)
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        return repr_str
+
+
+# custom transforms
+@PIPELINES.register_module
+class Lighting(object):
+    """Lighting noise(AlexNet - style PCA - based noise)"""
+    _IMAGENET_PCA = {
+        'eigval':
+        torch.Tensor([0.2175, 0.0188, 0.0045]),
+        'eigvec':
+        torch.Tensor([
+            [-0.5675, 0.7192, 0.4009],
+            [-0.5808, -0.0045, -0.8140],
+            [-0.5836, -0.6948, 0.4203],
+        ])
+    }
+
+    def __init__(self):
+        self.alphastd = 0.1
+        self.eigval = self._IMAGENET_PCA['eigval']
+        self.eigvec = self._IMAGENET_PCA['eigvec']
+
+    def __call__(self, img):
+        assert isinstance(img, torch.Tensor), \
+            "Expect torch.Tensor, got {}".format(type(img))
+        if self.alphastd == 0:
+            return img
+
+        alpha = img.new().resize_(3).normal_(0, self.alphastd)
+        rgb = self.eigvec.type_as(img).clone()\
+            .mul(alpha.view(1, 3).expand(3, 3))\
+            .mul(self.eigval.view(1, 3).expand(3, 3))\
+            .sum(1).squeeze()
+
+        return img.add(rgb.view(3, 1, 1).expand_as(img))
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        return repr_str
+
+
+@PIPELINES.register_module
+class GaussianBlur(object):
+
+    def __init__(self, sigma_min, sigma_max, kernel_size):
+        self.sigma_min = sigma_min
+        self.sigma_max = sigma_max
+        self.kernel_size = kernel_size
+
+    def __call__(self, img):
+        sigma = np.random.uniform(self.sigma_min, self.sigma_max)
+        img = cv2.GaussianBlur(
+            np.array(img), (self.kernel_size, self.kernel_size), sigma)
+        return Image.fromarray(img.astype(np.uint8))
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        return repr_str
--- a/openselfsup/datasets/registry.py
+++ b/openselfsup/datasets/registry.py
@ -0,0 +1,5 @@
+from openselfsup.utils import Registry
+
+# Registries for the dataset package. Classes are added with the
+# ``@<REGISTRY>.register_module`` decorator.
+DATASOURCES = Registry('datasource')
+DATASETS = Registry('dataset')
+PIPELINES = Registry('pipeline')
--- a/openselfsup/datasets/rotation_pred.py
+++ b/openselfsup/datasets/rotation_pred.py
@ -0,0 +1,35 @@
+import torch
+
+from .registry import DATASETS
+from .base import BaseDataset
+
+
+def rotate(img):
+    """Return the four right-angle rotations of an image.
+
+    Args:
+        img (Tensor): Image tensor in CHW layout.
+
+    Returns:
+        list[Tensor]: ``[img, r90, r180, r270]`` where ``img`` is the input
+            object itself and the others are rotated counter-clockwise by
+            90, 180 and 270 degrees in the HW plane.
+    """
+    turned_90 = torch.flip(img.transpose(1, 2), [1])
+    turned_180 = torch.flip(img, [1, 2])
+    turned_270 = torch.flip(img, [1]).transpose(1, 2)
+    return [img, turned_90, turned_180, turned_270]
+
+
+@DATASETS.register_module
+class RotationPredDataset(BaseDataset):
+    """Dataset for rotation prediction.
+
+    Each sample is turned into a stack of its four right-angle rotations,
+    paired with the labels ``[0, 1, 2, 3]``.
+    """
+
+    def __init__(self, data_source, pipeline):
+        super(RotationPredDataset, self).__init__(data_source, pipeline)
+
+    def __getitem__(self, idx):
+        """Return ``dict(img=<4 stacked rotations>, rot_label=<LongTensor>)``.
+
+        The pipeline output must be a CHW tensor with H == W, because the
+        90/270 degree rotations swap H and W and all four views are stacked.
+        """
+        img, _ = self.data_source.get_sample(idx)
+        img = self.pipeline(img)
+        img = torch.stack(rotate(img), dim=0)
+        rotation_labels = torch.LongTensor([0, 1, 2, 3])
+        return dict(img=img, rot_label=rotation_labels)
+
+    def evaluate(self, scores, keyword, logger=None):
+        """Evaluation is not supported for this dataset.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        # ``NotImplemented`` is a comparison sentinel, not an exception;
+        # raising it would produce a confusing TypeError instead.
+        raise NotImplementedError
--- a/openselfsup/hooks/init.py
+++ b/openselfsup/hooks/init.py
@ -0,0 +1,7 @@
+from .builder import build_hook
+from .deepcluster_hook import DeepClusterHook
+from .odc_hook import ODCHook
+from .optimizer_hook import DistOptimizerHook
+from .extractor import Extractor
+from .validate_hook import ValidateHook
+from .registry import HOOKS
--- a/openselfsup/hooks/builder.py
+++ b/openselfsup/hooks/builder.py
@ -0,0 +1,7 @@
+from openselfsup.utils import build_from_cfg
+
+from .registry import HOOKS
+
+
+def build_hook(cfg, default_args=None):
+    """Build a hook from ``cfg`` by looking it up in the ``HOOKS`` registry."""
+    hook = build_from_cfg(cfg, HOOKS, default_args)
+    return hook
--- a/openselfsup/hooks/deepcluster_hook.py
+++ b/openselfsup/hooks/deepcluster_hook.py
@ -0,0 +1,109 @@
+import numpy as np
+
+from mmcv.runner import Hook
+
+import torch
+import torch.distributed as dist
+
+from openselfsup.third_party import clustering as _clustering
+from openselfsup.utils import print_log
+from .registry import HOOKS
+from .extractor import Extractor
+
+
+@HOOKS.register_module
+class DeepClusterHook(Hook):
+    """Hook that re-clusters extracted features and refreshes pseudo-labels.
+
+    Args:
+        extractor (dict): Keyword arguments for ``Extractor``.
+        clustering (dict): Clustering config; its ``'type'`` entry names the
+            algorithm in ``openselfsup.third_party.clustering`` and the rest
+            are passed to it as keyword arguments. ``'type'`` is popped from
+            the passed object in place.
+        unif_sampling (bool): Whether to hand the new labels to the sampler
+            of the first data loader via ``set_uniform_indices``.
+        reweight (bool): Whether to call ``set_reweight`` on the model.
+        reweight_pow: Second argument given to ``set_reweight``.
+        init_memory (bool): Whether to initialise the model's memory bank
+            with the features and labels (used for ODC). Default: False.
+        initial (bool): Whether to cluster once before training starts.
+            Default: True.
+        interval (int): Cluster every ``interval`` epochs. Default: 1.
+        dist_mode (bool): Whether training is distributed. Default: True.
+        data_loaders (list): Data loaders; only the first one is used.
+    """
+
+    def __init__(
+            self,
+            extractor,
+            clustering,
+            unif_sampling,
+            reweight,
+            reweight_pow,
+            init_memory=False,  # for ODC
+            initial=True,
+            interval=1,
+            dist_mode=True,
+            data_loaders=None):
+        self.extractor = Extractor(dist_mode=dist_mode, **extractor)
+        # The algorithm name is split off; what remains are its kwargs.
+        self.clustering_type = clustering.pop('type')
+        self.clustering_cfg = clustering
+        self.unif_sampling = unif_sampling
+        self.reweight = reweight
+        self.reweight_pow = reweight_pow
+        self.init_memory = init_memory
+        self.initial = initial
+        self.interval = interval
+        self.dist_mode = dist_mode
+        self.data_loaders = data_loaders
+
+    def before_run(self, runner):
+        if not self.initial:
+            return
+        self.deepcluster(runner)
+
+    def after_train_epoch(self, runner):
+        if self.every_n_epochs(runner, self.interval):
+            self.deepcluster(runner)
+
+    def _cluster_features(self, runner, features):
+        """Cluster ``features``, save and log the labels, and return them."""
+        algo_cls = _clustering.__dict__[self.clustering_type]
+        algo = algo_cls(**self.clustering_cfg)
+        # Features are normalized during clustering
+        algo.cluster(features, verbose=True)
+        assert isinstance(algo.labels, np.ndarray)
+        labels = algo.labels.astype(np.int64)
+        save_path = "{}/cluster_epoch_{}.npy".format(runner.work_dir,
+                                                     runner.epoch)
+        np.save(save_path, labels)
+        self.evaluate(runner, labels)
+        return labels
+
+    def deepcluster(self, runner):
+        """Run one full round: extract, cluster, broadcast, re-assign."""
+        # step 1: get features
+        runner.model.eval()
+        features = self.extractor(runner)
+        runner.model.train()
+
+        # step 2: get labels (only one process clusters in dist mode)
+        does_clustering = (not self.dist_mode) or runner.rank == 0
+        if does_clustering:
+            new_labels = self._cluster_features(runner, features)
+        else:
+            # Placeholder that the broadcast below overwrites.
+            num_samples = len(self.data_loaders[0].dataset)
+            new_labels = np.zeros((num_samples, ), dtype=np.int64)
+
+        if self.dist_mode:
+            labels_gpu = torch.from_numpy(new_labels).cuda()
+            dist.broadcast(labels_gpu, 0)
+            new_labels = labels_gpu.cpu().numpy()
+        new_labels_list = list(new_labels)
+
+        # step 3: assign new labels
+        self.data_loaders[0].dataset.assign_labels(new_labels_list)
+
+        # step 4 (a): set uniform sampler
+        if self.unif_sampling:
+            self.data_loaders[0].sampler.set_uniform_indices(
+                new_labels_list, self.clustering_cfg.k)
+
+        # step 4 (b): set loss reweight
+        net = runner.model.module
+        if self.reweight:
+            net.set_reweight(new_labels, self.reweight_pow)
+
+        # step 5: randomize classifier
+        net.head.init_weights(init_linear='normal')
+        if self.dist_mode:
+            # Make every process use rank 0's freshly initialised head.
+            for tensor in net.head.state_dict().values():
+                dist.broadcast(tensor, 0)
+
+        # step 6: init memory for ODC
+        if self.init_memory:
+            net.memory_bank.init_memory(features, new_labels)
+
+    def evaluate(self, runner, new_labels):
+        """Log the number of empty clusters and the min/max cluster size."""
+        sizes = np.bincount(new_labels, minlength=self.clustering_cfg.k)
+        num_empty = (sizes == 0).sum()
+        smallest = sizes.min()
+        largest = sizes.max()
+        if runner.rank != 0:
+            return
+        print_log(
+            "empty_num: {}\tmin_cluster: {}\tmax_cluster:{}".format(
+                num_empty.item(), smallest.item(), largest.item()),
+            logger='root')
--- a/openselfsup/hooks/extractor.py
+++ b/openselfsup/hooks/extractor.py
@ -0,0 +1,50 @@
+import torch.nn as nn
+from torch.utils.data import Dataset
+
+from openselfsup.utils import nondist_forward_collect, dist_forward_collect
+
+
+class Extractor(object):
+    """Run a model over a dataset and collect one feature row per sample.
+
+    Args:
+        dataset (Dataset | dict): A dataset object, or a config dict that is
+            turned into one with ``datasets.build_dataset``.
+        imgs_per_gpu: Forwarded to ``datasets.build_dataloader``.
+        workers_per_gpu: Forwarded to ``datasets.build_dataloader``.
+        dist_mode (bool): Whether to build a distributed loader and use the
+            distributed collection routine. Default: False.
+
+    Raises:
+        TypeError: If ``dataset`` is neither a ``Dataset`` nor a ``dict``.
+    """
+
+    def __init__(self,
+                 dataset,
+                 imgs_per_gpu,
+                 workers_per_gpu,
+                 dist_mode=False):
+        from openselfsup import datasets
+        if not isinstance(dataset, (Dataset, dict)):
+            raise TypeError(
+                'dataset must be a Dataset object or a dict, not {}'.format(
+                    type(dataset)))
+        if isinstance(dataset, Dataset):
+            self.dataset = dataset
+        else:
+            self.dataset = datasets.build_dataset(dataset)
+        self.data_loader = datasets.build_dataloader(
+            self.dataset,
+            imgs_per_gpu,
+            workers_per_gpu,
+            dist=dist_mode,
+            shuffle=False)
+        self.dist_mode = dist_mode
+        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+
+    def _forward_func(self, runner, **x):
+        """Return the flattened neck output of the last backbone stage."""
+        stage_feats = runner.model(mode='extract', **x)
+        neck_out = runner.model.module.neck([stage_feats[-1]])
+        flat = neck_out[0]
+        flat = flat.view(flat.size(0), -1)
+        return dict(feature=flat.cpu())
+
+    def __call__(self, runner):
+        """Extract features for the whole dataset with ``runner.model``."""
+
+        def forward_batch(**batch):
+            return self._forward_func(runner, **batch)
+
+        if not self.dist_mode:
+            collected = nondist_forward_collect(forward_batch,
+                                                self.data_loader,
+                                                len(self.dataset))
+            return collected['feature']
+
+        collected = dist_forward_collect(
+            forward_batch,
+            self.data_loader,
+            runner.rank,
+            len(self.dataset),
+            ret_rank=-1)
+        return collected['feature']  # NxD
--- a/openselfsup/hooks/odc_hook.py
+++ b/openselfsup/hooks/odc_hook.py
@ -0,0 +1,67 @@
+import numpy as np
+
+from mmcv.runner import Hook
+
+from openselfsup.utils import print_log
+from .registry import HOOKS
+
+
+@HOOKS.register_module
+class ODCHook(Hook):
+    """Hook running the periodic memory-bank maintenance of ODC training.
+
+    Args:
+        centroids_update_interval (int): Update the centroids memory every
+            this many iterations.
+        deal_with_small_clusters_interval (int): Call
+            ``deal_with_small_clusters`` every this many iterations.
+        evaluate_interval (int): Log cluster statistics every this many
+            iterations.
+        reweight: Stored on the hook; not read by the methods of this class.
+        reweight_pow: Stored on the hook; not read by the methods of this
+            class.
+        dist_mode (bool): Must be True; non-distributed mode is not
+            implemented. Default: True.
+    """
+
+    def __init__(self,
+                 centroids_update_interval,
+                 deal_with_small_clusters_interval,
+                 evaluate_interval,
+                 reweight,
+                 reweight_pow,
+                 dist_mode=True):
+        assert dist_mode, "non-dist mode is not implemented"
+        self.centroids_update_interval = centroids_update_interval
+        self.deal_with_small_clusters_interval = \
+            deal_with_small_clusters_interval
+        self.evaluate_interval = evaluate_interval
+        self.reweight = reweight
+        self.reweight_pow = reweight_pow
+
+    def after_train_iter(self, runner):
+        # centroids update
+        if self.every_n_iters(runner, self.centroids_update_interval):
+            runner.model.module.memory_bank.update_centroids_memory()
+
+        # deal with small clusters
+        if self.every_n_iters(runner, self.deal_with_small_clusters_interval):
+            runner.model.module.memory_bank.deal_with_small_clusters()
+
+        # reweight (done after every iteration, without arguments)
+        runner.model.module.set_reweight()
+
+        # evaluate
+        if self.every_n_iters(runner, self.evaluate_interval):
+            new_labels = runner.model.module.memory_bank.label_bank
+            if new_labels.is_cuda:
+                new_labels = new_labels.cpu()
+            self.evaluate(runner, new_labels.numpy())
+
+    def after_train_epoch(self, runner):
+        # save cluster
+        # ``every_n_epochs`` needs the runner as its first argument; calling
+        # it with only the period raised TypeError on every epoch.
+        if self.every_n_epochs(runner, 10) and runner.rank == 0:
+            new_labels = runner.model.module.memory_bank.label_bank
+            if new_labels.is_cuda:
+                new_labels = new_labels.cpu()
+            np.save(
+                "{}/cluster_epoch_{}.npy".format(runner.work_dir,
+                                                 runner.epoch),
+                new_labels.numpy())
+
+    def evaluate(self, runner, new_labels):
+        """Log the number of empty clusters and the min/max cluster size."""
+        hist = np.bincount(
+            new_labels, minlength=runner.model.module.memory_bank.num_classes)
+        empty_cls = (hist == 0).sum()
+        minimal_cls_size, maximal_cls_size = hist.min(), hist.max()
+        if runner.rank == 0:
+            print_log(
+                "empty_num: {}\tmin_cluster: {}\tmax_cluster:{}".format(
+                    empty_cls.item(), minimal_cls_size.item(),
+                    maximal_cls_size.item()),
+                logger='root')
--- a/openselfsup/hooks/optimizer_hook.py
+++ b/openselfsup/hooks/optimizer_hook.py
@ -0,0 +1,16 @@
+from mmcv.runner import OptimizerHook
+
+
+class DistOptimizerHook(OptimizerHook):
+    """Optimizer hook performing one backward/step cycle per iteration.
+
+    Args:
+        grad_clip: Gradient clipping setting; clipping is skipped when it is
+            ``None``. Default: None.
+        coalesce (bool): Stored on the hook; not read by this class.
+            Default: True.
+        bucket_size_mb (int): Stored on the hook; not read by this class.
+            Default: -1.
+    """
+
+    def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1):
+        self.grad_clip = grad_clip
+        self.coalesce = coalesce
+        self.bucket_size_mb = bucket_size_mb
+
+    def after_train_iter(self, runner):
+        """Zero grads, back-propagate the loss, optionally clip, then step."""
+        optimizer = runner.optimizer
+        optimizer.zero_grad()
+        loss = runner.outputs['loss']
+        loss.backward()
+        clip_requested = self.grad_clip is not None
+        if clip_requested:
+            self.clip_grads(runner.model.parameters())
+        optimizer.step()
--- a/openselfsup/hooks/registry.py
+++ b/openselfsup/hooks/registry.py
@ -0,0 +1,3 @@
+from openselfsup.utils import Registry
+
+# Registry of hook classes; entries are added with ``@HOOKS.register_module``.
+HOOKS = Registry('hook')
--- a/openselfsup/hooks/validate_hook.py
+++ b/openselfsup/hooks/validate_hook.py
@ -0,0 +1,71 @@
+from mmcv.runner import Hook
+
+import torch
+from torch.utils.data import Dataset
+
+from openselfsup.utils import nondist_forward_collect, dist_forward_collect
+from .registry import HOOKS
+
+
+@HOOKS.register_module
+class ValidateHook(Hook):
+    """Hook that evaluates the model on a validation dataset.
+
+    Args:
+        dataset (Dataset | dict): A dataset object, or a config dict that is
+            turned into one with ``datasets.build_dataset``.
+        dist_mode (bool): Whether to use the distributed loader and
+            collection routine. Default: True.
+        initial (bool): Whether to validate once before training starts.
+            Default: True.
+        interval (int): Validate every ``interval`` epochs. Default: 1.
+        **eval_kwargs: Must contain ``imgs_per_gpu`` and ``workers_per_gpu``
+            (used to build the loader) and ``eval_param`` (a dict expanded
+            into ``dataset.evaluate``).
+
+    Raises:
+        TypeError: If ``dataset`` is neither a ``Dataset`` nor a ``dict``.
+    """
+
+    def __init__(self,
+                 dataset,
+                 dist_mode=True,
+                 initial=True,
+                 interval=1,
+                 **eval_kwargs):
+        from openselfsup import datasets
+        if not isinstance(dataset, (Dataset, dict)):
+            raise TypeError(
+                'dataset must be a Dataset object or a dict, not {}'.format(
+                    type(dataset)))
+        if isinstance(dataset, Dataset):
+            self.dataset = dataset
+        else:
+            self.dataset = datasets.build_dataset(dataset)
+        batch_size = eval_kwargs['imgs_per_gpu']
+        num_workers = eval_kwargs['workers_per_gpu']
+        self.data_loader = datasets.build_dataloader(
+            self.dataset,
+            batch_size,
+            num_workers,
+            dist=dist_mode,
+            shuffle=False)
+        self.dist_mode = dist_mode
+        self.initial = initial
+        self.interval = interval
+        self.eval_kwargs = eval_kwargs
+
+    def before_run(self, runner):
+        if not self.initial:
+            return
+        self._run_validate(runner)
+
+    def after_train_epoch(self, runner):
+        if self.every_n_epochs(runner, self.interval):
+            self._run_validate(runner)
+
+    def _run_validate(self, runner):
+        """Forward the whole dataset in test mode and evaluate on rank 0."""
+        runner.model.eval()
+
+        def forward_test(**batch):
+            return runner.model(mode='test', **batch)
+
+        if self.dist_mode:
+            # dict{key: np.ndarray}
+            results = dist_forward_collect(forward_test, self.data_loader,
+                                           runner.rank, len(self.dataset))
+        else:
+            results = nondist_forward_collect(forward_test, self.data_loader,
+                                              len(self.dataset))
+        if runner.rank == 0:
+            for key, array in results.items():
+                self._evaluate(runner, torch.from_numpy(array), key)
+        runner.model.train()
+
+    def _evaluate(self, runner, results, keyword):
+        """Evaluate one output and push the metrics to the log buffer."""
+        metrics = self.dataset.evaluate(
+            results,
+            keyword=keyword,
+            logger=runner.logger,
+            **self.eval_kwargs['eval_param'])
+        log_output = runner.log_buffer.output
+        for metric_name, metric_val in metrics.items():
+            log_output[metric_name] = metric_val
+        runner.log_buffer.ready = True
--- a/openselfsup/models/init.py
+++ b/openselfsup/models/init.py
@ -0,0 +1,20 @@
+from .backbones import *  # noqa: F401,F403
+from .builder import (build_backbone, build_model, build_head, build_loss)
+from .heads import *
+from .classification import Classification
+from .deepcluster import DeepCluster
+from .odc import ODC
+from .losses import *  # noqa: F401,F403
+from .necks import *
+from .npid import NPID
+from .memories import *
+from .moco import MOCO
+from .registry import (BACKBONES, MODELS, NECKS, MEMORIES, HEADS, LOSSES)
+from .rotation_pred import RotationPred
+from .simclr import SimCLR
+
+#__all__ = [
+#    'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'LOSSES',
+#    'DETECTORS', 'CLASSIFIERS', 'build_backbone', 'build_neck', 'build_roi_extractor',
+#    'build_shared_head', 'build_head', 'build_loss', 'build_detector', 'build_detector'
+#]
--- a/openselfsup/models/backbones/init.py
+++ b/openselfsup/models/backbones/init.py
@ -0,0 +1,6 @@
+#from .hrnet import HRNet
+from .resnet import ResNet, make_res_layer
+#from .resnext import ResNeXt
+#from .ssd_vgg import SSDVGG
+
+#__all__ = ['ResNet', 'make_res_layer', 'ResNeXt', 'SSDVGG', 'HRNet']
--- a/openselfsup/models/backbones/resnet.py
+++ b/openselfsup/models/backbones/resnet.py
@ -0,0 +1,429 @@
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import constant_init, kaiming_init
+from mmcv.runner import load_checkpoint
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from openselfsup.utils import get_root_logger
+from ..registry import BACKBONES
+from ..utils import build_conv_layer, build_norm_layer
+
+
+class BasicBlock(nn.Module):
+    """Residual block made of two 3x3 convolutions, each followed by a norm.
+
+    Args:
+        inplanes (int): Number of input channels.
+        planes (int): Number of channels of both convolutions.
+        stride (int): Stride of the first convolution. Default: 1.
+        dilation (int): Dilation (and padding) of the first convolution.
+            Default: 1.
+        downsample (nn.Module | None): Module applied to the input to form
+            the shortcut; the input itself is used when None. Default: None.
+        style (str): Accepted for signature parity with ``Bottleneck``; not
+            read by this block.
+        with_cp (bool): Must be False; checkpointing is not supported here.
+        conv_cfg (dict | None): Config passed to ``build_conv_layer``.
+        norm_cfg (dict): Config passed to ``build_norm_layer``.
+    """
+    # Output channels are ``planes * expansion``.
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN')):
+        super(BasicBlock, self).__init__()
+
+        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            3,
+            stride=stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=False)
+        # Norm layers are registered under the names returned by
+        # ``build_norm_layer`` and reached through the properties below.
+        self.add_module(self.norm1_name, norm1)
+        # The second conv always uses stride 1 (default) and padding 1.
+        self.conv2 = build_conv_layer(
+            conv_cfg, planes, planes, 3, padding=1, bias=False)
+        self.add_module(self.norm2_name, norm2)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        assert not with_cp
+
+    @property
+    def norm1(self):
+        """Norm layer following ``conv1``."""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """Norm layer following ``conv2``."""
+        return getattr(self, self.norm2_name)
+
+    def forward(self, x):
+        """Return ``relu(norm2(conv2(relu(norm1(conv1(x))))) + shortcut)``."""
+        identity = x
+
+        out = self.conv1(x)
+        out = self.norm1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.norm2(out)
+
+        # Project the shortcut when a downsample module was supplied.
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    """Residual bottleneck block: 1x1 conv, 3x3 conv, 1x1 conv, each + norm.
+
+    Args:
+        inplanes (int): Number of input channels.
+        planes (int): Number of channels of the first two convolutions; the
+            block outputs ``planes * expansion`` channels.
+        stride (int): Stride of the block, applied in the conv selected by
+            ``style``. Default: 1.
+        dilation (int): Dilation (and padding) of the 3x3 conv. Default: 1.
+        downsample (nn.Module | None): Module applied to the input to form
+            the shortcut; the input itself is used when None. Default: None.
+        style (str): ``'pytorch'`` or ``'caffe'``, see ``__init__``.
+        with_cp (bool): Whether to run the block under
+            ``torch.utils.checkpoint`` when the input requires grad.
+        conv_cfg (dict | None): Config passed to ``build_conv_layer``.
+        norm_cfg (dict): Config passed to ``build_norm_layer``.
+    """
+    # Output channels are ``planes * expansion``.
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN')):
+        """Bottleneck block for ResNet.
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer,
+        if it is "caffe", the stride-two layer is the first 1x1 conv layer.
+        """
+        super(Bottleneck, self).__init__()
+        assert style in ['pytorch', 'caffe']
+
+        self.inplanes = inplanes
+        self.planes = planes
+        self.stride = stride
+        self.dilation = dilation
+        self.style = style
+        self.with_cp = with_cp
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        # Decide which of conv1 / conv2 carries the block's stride.
+        if self.style == 'pytorch':
+            self.conv1_stride = 1
+            self.conv2_stride = stride
+        else:
+            self.conv1_stride = stride
+            self.conv2_stride = 1
+
+        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+        self.norm3_name, norm3 = build_norm_layer(
+            norm_cfg, planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        # Norm layers are registered under the names returned by
+        # ``build_norm_layer`` and reached through the properties below.
+        self.add_module(self.norm1_name, norm1)
+        self.conv2 = build_conv_layer(
+            conv_cfg,
+            planes,
+            planes,
+            kernel_size=3,
+            stride=self.conv2_stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=False)
+        self.add_module(self.norm2_name, norm2)
+        self.conv3 = build_conv_layer(
+            conv_cfg,
+            planes,
+            planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+
+    @property
+    def norm1(self):
+        """Norm layer following ``conv1``."""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """Norm layer following ``conv2``."""
+        return getattr(self, self.norm2_name)
+
+    @property
+    def norm3(self):
+        """Norm layer following ``conv3``."""
+        return getattr(self, self.norm3_name)
+
+    def forward(self, x):
+        """Apply the three conv/norm stages, add the shortcut, then ReLU."""
+
+        # Everything up to the residual sum lives in a closure so it can be
+        # handed to ``cp.checkpoint`` as a single unit.
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+            out = self.relu(out)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        # Checkpointing is only used when enabled and the input needs grad.
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+def make_res_layer(block,
+                   inplanes,
+                   planes,
+                   blocks,
+                   stride=1,
+                   dilation=1,
+                   style='pytorch',
+                   with_cp=False,
+                   conv_cfg=None,
+                   norm_cfg=dict(type='BN')):
+    """Stack ``blocks`` residual blocks into one ``nn.Sequential`` stage.
+
+    Only the first block receives ``stride`` and, when the stride is not 1 or
+    the channel count changes, a 1x1 conv + norm ``downsample`` shortcut. The
+    remaining blocks use stride 1 and ``planes * block.expansion`` inputs.
+
+    Args:
+        block (type): Block class exposing an ``expansion`` attribute.
+        inplanes (int): Input channels of the first block.
+        planes (int): Base channel count handed to every block.
+        blocks (int): Number of blocks in the stage.
+        stride (int): Stride of the first block. Default: 1.
+        dilation (int): Dilation handed to every block. Default: 1.
+        style (str): Block style handed to every block.
+        with_cp (bool): Checkpoint flag handed to every block.
+        conv_cfg (dict | None): Config for ``build_conv_layer``.
+        norm_cfg (dict): Config for ``build_norm_layer``.
+
+    Returns:
+        nn.Sequential: The assembled stage.
+    """
+    out_channels = planes * block.expansion
+    shared_kwargs = dict(
+        planes=planes,
+        dilation=dilation,
+        style=style,
+        with_cp=with_cp,
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg)
+
+    # Projection shortcut for the first block, built only when needed.
+    downsample = None
+    if stride != 1 or inplanes != out_channels:
+        proj_conv = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            out_channels,
+            kernel_size=1,
+            stride=stride,
+            bias=False)
+        proj_norm = build_norm_layer(norm_cfg, out_channels)[1]
+        downsample = nn.Sequential(proj_conv, proj_norm)
+
+    stage = [
+        block(
+            inplanes=inplanes,
+            stride=stride,
+            downsample=downsample,
+            **shared_kwargs)
+    ]
+    stage.extend(
+        block(inplanes=out_channels, stride=1, **shared_kwargs)
+        for _ in range(1, blocks))
+
+    return nn.Sequential(*stage)
+
+
+@BACKBONES.register_module
+class ResNet(nn.Module):
+    """ResNet backbone.
+
+    Args:
+        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+        in_channels (int): Number of input image channels. Normally 3.
+        num_stages (int): Resnet stages, normally 4.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Which outputs to return. Index 0 is the
+            stem output (after conv1/norm1/ReLU, before max-pooling); index
+            ``i >= 1`` is the output of residual stage ``i``.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters. 0 freezes the stem only.
+        conv_cfg (dict | None): dictionary to construct and config conv layer.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+
+    Example:
+        >>> from openselfsup.models import ResNet
+        >>> import torch
+        >>> self = ResNet(depth=18)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 64, 16, 16)
+        (1, 64, 8, 8)
+        (1, 128, 4, 4)
+        (1, 256, 2, 2)
+        (1, 512, 1, 1)
+    """
+
+    # depth -> (block class, number of blocks in each of the four stages)
+    arch_settings = {
+        18: (BasicBlock, (2, 2, 2, 2)),
+        34: (BasicBlock, (3, 4, 6, 3)),
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 depth,
+                 in_channels=3,
+                 num_stages=4,
+                 strides=(1, 2, 2, 2),
+                 dilations=(1, 1, 1, 1),
+                 out_indices=(0, 1, 2, 3, 4),
+                 style='pytorch',
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 norm_eval=False,
+                 with_cp=False,
+                 zero_init_residual=False):
+        super(ResNet, self).__init__()
+        if depth not in self.arch_settings:
+            raise KeyError('invalid depth {} for resnet'.format(depth))
+        self.depth = depth
+        self.num_stages = num_stages
+        assert num_stages >= 1 and num_stages <= 4
+        self.strides = strides
+        self.dilations = dilations
+        assert len(strides) == len(dilations) == num_stages
+        self.out_indices = out_indices
+        # Valid indices run from 0 (stem) to num_stages (last stage).
+        assert max(out_indices) < num_stages + 1
+        self.style = style
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.with_cp = with_cp
+        self.norm_eval = norm_eval
+        self.zero_init_residual = zero_init_residual
+        self.block, stage_blocks = self.arch_settings[depth]
+        self.stage_blocks = stage_blocks[:num_stages]
+        # Channel count entering the next stage; the stem outputs 64.
+        self.inplanes = 64
+
+        self._make_stem_layer(in_channels)
+
+        # Names of the registered stage modules, in forward order.
+        self.res_layers = []
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = strides[i]
+            dilation = dilations[i]
+            # The base width doubles at every stage: 64, 128, 256, 512.
+            planes = 64 * 2**i
+            res_layer = make_res_layer(
+                self.block,
+                self.inplanes,
+                planes,
+                num_blocks,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                with_cp=with_cp,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg)
+            self.inplanes = planes * self.block.expansion
+            layer_name = 'layer{}'.format(i + 1)
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self._freeze_stages()
+
+        # Channel count of the last stage's output.
+        self.feat_dim = self.block.expansion * 64 * 2**(
+            len(self.stage_blocks) - 1)
+
+    @property
+    def norm1(self):
+        """Norm layer of the stem, registered in ``_make_stem_layer``."""
+        return getattr(self, self.norm1_name)
+
+    def _make_stem_layer(self, in_channels):
+        """Build the stem: 7x7 stride-2 conv, norm, ReLU and 3x3 max-pool."""
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            in_channels,
+            64,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias=False)
+        self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1)
+        self.add_module(self.norm1_name, norm1)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+    def _freeze_stages(self):
+        """Put frozen parts in eval mode and stop their gradients."""
+        # Any non-negative value freezes the stem.
+        if self.frozen_stages >= 0:
+            self.norm1.eval()
+            for m in [self.conv1, self.norm1]:
+                for param in m.parameters():
+                    param.requires_grad = False
+
+        # Stages 1..frozen_stages are frozen as well.
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, 'layer{}'.format(i))
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def init_weights(self, pretrained=None):
+        """Load a checkpoint or initialise the weights from scratch.
+
+        Args:
+            pretrained (str | None): Checkpoint path/URL handed to
+                ``load_checkpoint`` (non-strict). With None, convs get
+                Kaiming init and norm layers are set to 1.
+
+        Raises:
+            TypeError: If ``pretrained`` is neither a str nor None.
+        """
+        if isinstance(pretrained, str):
+            logger = get_root_logger()
+            load_checkpoint(self, pretrained, strict=False, logger=logger)
+        elif pretrained is None:
+            for m in self.modules():
+                if isinstance(m, nn.Conv2d):
+                    kaiming_init(m, mode='fan_in', nonlinearity='relu')
+                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
+                    constant_init(m, 1)
+
+            # Zero the last norm of every block so it starts as an identity.
+            if self.zero_init_residual:
+                for m in self.modules():
+                    if isinstance(m, Bottleneck):
+                        constant_init(m.norm3, 0)
+                    elif isinstance(m, BasicBlock):
+                        constant_init(m.norm2, 0)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        """Return a tuple with the outputs selected by ``out_indices``.
+
+        The shape comments below are for ResNet-50 with a 224x224 input.
+        """
+        outs = []
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.relu(x)  # r50: 64x112x112
+        if 0 in self.out_indices:
+            outs.append(x)
+        x = self.maxpool(x)  # r50: 64x56x56
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            # Stage outputs are numbered from 1; 0 is the stem output.
+            if i + 1 in self.out_indices:
+                outs.append(x)
+        # r50: 1-256x56x56; 2-512x28x28; 3-1024x14x14; 4-2048x7x7
+        return tuple(outs)
+
+    def train(self, mode=True):
+        """Switch mode while keeping frozen stages (and norms) in eval."""
+        super(ResNet, self).train(mode)
+        # ``super().train`` flips every sub-module, so re-apply the freeze.
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval have effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
--- a/openselfsup/models/backbones/resnext.py
+++ b/openselfsup/models/backbones/resnext.py
@ -0,0 +1,222 @@
+import math
+
+import torch.nn as nn
+
+from ..registry import BACKBONES
+from ..utils import build_conv_layer, build_norm_layer
+from .resnet import Bottleneck as _Bottleneck
+from .resnet import ResNet
+
+
+class Bottleneck(_Bottleneck):
+    """ResNeXt bottleneck: the ResNet bottleneck with a grouped 3x3 conv.
+
+    The parent constructor runs first; this class then assigns ``conv1``,
+    ``conv2``, ``conv3`` and the three norm layers again using the ResNeXt
+    channel width computed from ``groups`` and ``base_width``.
+    """
+
+    def __init__(self, inplanes, planes, groups=1, base_width=4, **kwargs):
+        """Bottleneck block for ResNeXt.
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer,
+        if it is "caffe", the stride-two layer is the first 1x1 conv layer.
+
+        Args:
+            inplanes (int): Forwarded to the parent block.
+            planes (int): Forwarded to the parent block; the output has
+                ``planes * expansion`` channels.
+            groups (int): Number of groups of the 3x3 conv.
+            base_width (int): Per-group width factor (relative to 64).
+            **kwargs: Forwarded unchanged to the parent block.
+        """
+        super(Bottleneck, self).__init__(inplanes, planes, **kwargs)
+
+        # Channel count of the two inner convs: unchanged for groups == 1,
+        # otherwise scaled by base_width / 64 and multiplied by groups.
+        if groups == 1:
+            width = self.planes
+        else:
+            width = math.floor(self.planes * (base_width / 64)) * groups
+
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, width, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(
+            self.norm_cfg, width, postfix=2)
+        self.norm3_name, norm3 = build_norm_layer(
+            self.norm_cfg, self.planes * self.expansion, postfix=3)
+
+        # 1x1 conv: inplanes -> width
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            self.inplanes,
+            width,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        fallback_on_stride = False
+        self.with_modulated_dcn = False
+        if self.with_dcn:
+            # NOTE: pop() removes the key from the dcn dict held on self.
+            fallback_on_stride = self.dcn.pop('fallback_on_stride', False)
+        if not self.with_dcn or fallback_on_stride:
+            # Regular grouped 3x3 conv built from conv_cfg.
+            self.conv2 = build_conv_layer(
+                self.conv_cfg,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+        else:
+            # Grouped 3x3 conv built from the dcn config instead.
+            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+            self.conv2 = build_conv_layer(
+                self.dcn,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        # 1x1 conv: width -> planes * expansion
+        self.conv3 = build_conv_layer(
+            self.conv_cfg,
+            width,
+            self.planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+
+def make_res_layer(block,
+                   inplanes,
+                   planes,
+                   blocks,
+                   stride=1,
+                   dilation=1,
+                   groups=1,
+                   base_width=4,
+                   style='pytorch',
+                   with_cp=False,
+                   conv_cfg=None,
+                   norm_cfg=dict(type='BN'),
+                   dcn=None,
+                   gcb=None):
+    """Stack ``blocks`` residual blocks into one ``nn.Sequential`` stage.
+
+    Only the first block receives ``stride`` and the optional projection
+    shortcut; the remaining blocks use stride 1 and take
+    ``planes * block.expansion`` input channels.
+
+    Args:
+        block (type): Block class; must expose an ``expansion`` attribute.
+        inplanes (int): Input channels of the first block.
+        planes (int): Base channel count passed to every block.
+        blocks (int): Number of blocks in the stage.
+        stride (int): Stride of the first block.
+        dilation (int): Dilation passed to every block.
+        groups (int): Passed to every block.
+        base_width (int): Passed to every block.
+        style (str): Passed to every block.
+        with_cp (bool): Passed to every block.
+        conv_cfg (dict | None): Config for ``build_conv_layer``.
+        norm_cfg (dict): Config for ``build_norm_layer``.
+        dcn (dict | None): Passed to every block.
+        gcb (dict | None): Passed to every block.
+
+    Returns:
+        nn.Sequential: The assembled stage.
+    """
+    out_channels = planes * block.expansion
+
+    # A 1x1 conv + norm shortcut is needed whenever the first block changes
+    # the spatial stride or the channel count.
+    downsample = None
+    if stride != 1 or inplanes != out_channels:
+        shortcut_conv = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            out_channels,
+            kernel_size=1,
+            stride=stride,
+            bias=False)
+        shortcut_norm = build_norm_layer(norm_cfg, out_channels)[1]
+        downsample = nn.Sequential(shortcut_conv, shortcut_norm)
+
+    # Keyword arguments that are identical for every block of the stage.
+    shared = dict(
+        planes=planes,
+        dilation=dilation,
+        groups=groups,
+        base_width=base_width,
+        style=style,
+        with_cp=with_cp,
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        dcn=dcn,
+        gcb=gcb)
+
+    layers = [
+        block(
+            inplanes=inplanes,
+            stride=stride,
+            downsample=downsample,
+            **shared)
+    ]
+    for _ in range(1, blocks):
+        layers.append(block(inplanes=out_channels, stride=1, **shared))
+
+    return nn.Sequential(*layers)
+
+
+@BACKBONES.register_module
+class ResNeXt(ResNet):
+    """ResNeXt backbone.
+
+    The parent ``ResNet`` constructor runs first; the residual stages are
+    then rebuilt with the grouped ``Bottleneck`` of this module and
+    re-registered under the same ``layer{i}`` names.
+
+    Args:
+        depth (int): Depth of resnext, from {50, 101, 152} (the keys of
+            ``arch_settings``).
+        in_channels (int): Number of input image channels. Normally 3.
+        num_stages (int): Resnet stages, normally 4.
+        groups (int): Group of resnext.
+        base_width (int): Base width of resnext.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+            not freezing any parameters.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+
+    Example:
+        >>> from openselfsup.models import ResNeXt
+        >>> import torch
+        >>> self = ResNeXt(depth=50)
+        >>> _ = self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 256, 8, 8)
+        (1, 512, 4, 4)
+        (1, 1024, 2, 2)
+        (1, 2048, 1, 1)
+    """
+
+    # depth -> (block class, number of blocks per stage)
+    arch_settings = {
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self, groups=1, base_width=4, **kwargs):
+        """Build the parent network, then rebuild its residual stages.
+
+        Args:
+            groups (int): Groups of the 3x3 conv in every block.
+            base_width (int): Base width per group.
+            **kwargs: Forwarded unchanged to ``ResNet.__init__``.
+        """
+        super(ResNeXt, self).__init__(**kwargs)
+        self.groups = groups
+        self.base_width = base_width
+
+        # Start over from the stem width and replace the stages created by
+        # the parent constructor (add_module overwrites the same names).
+        self.inplanes = 64
+        self.res_layers = []
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = self.strides[i]
+            dilation = self.dilations[i]
+            dcn = self.dcn if self.stage_with_dcn[i] else None
+            gcb = self.gcb if self.stage_with_gcb[i] else None
+            planes = 64 * 2**i
+            res_layer = make_res_layer(
+                self.block,
+                self.inplanes,
+                planes,
+                num_blocks,
+                stride=stride,
+                dilation=dilation,
+                groups=self.groups,
+                base_width=self.base_width,
+                style=self.style,
+                with_cp=self.with_cp,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                dcn=dcn,
+                gcb=gcb)
+            self.inplanes = planes * self.block.expansion
+            layer_name = 'layer{}'.format(i + 1)
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        # Re-apply freezing because the stages were just replaced.
+        self._freeze_stages()
--- a/openselfsup/models/builder.py
+++ b/openselfsup/models/builder.py
@ -0,0 +1,38 @@
+from torch import nn
+
+from openselfsup.utils import build_from_cfg
+from .registry import (BACKBONES, MODELS, NECKS, HEADS, MEMORIES, LOSSES)
+
+
+def build(cfg, registry, default_args=None):
+    """Build one module, or an ``nn.Sequential`` of modules, from config.
+
+    Args:
+        cfg (dict | list[dict]): A single config, or a list of configs that
+            are built in order and chained.
+        registry: Registry handed to ``build_from_cfg``.
+        default_args (dict | None): Forwarded to ``build_from_cfg``.
+
+    Returns:
+        The object returned by ``build_from_cfg``, or an ``nn.Sequential``
+        wrapping one such object per list entry.
+    """
+    if not isinstance(cfg, list):
+        return build_from_cfg(cfg, registry, default_args)
+
+    built = []
+    for item_cfg in cfg:
+        built.append(build_from_cfg(item_cfg, registry, default_args))
+    return nn.Sequential(*built)
+
+
+def build_backbone(cfg):
+    """Build from ``cfg`` using the ``BACKBONES`` registry."""
+    return build(cfg, registry=BACKBONES)
+
+
+def build_neck(cfg):
+    """Build from ``cfg`` using the ``NECKS`` registry."""
+    return build(cfg, registry=NECKS)
+
+
+def build_memory(cfg):
+    """Build from ``cfg`` using the ``MEMORIES`` registry."""
+    return build(cfg, registry=MEMORIES)
+
+
+def build_head(cfg):
+    """Build from ``cfg`` using the ``HEADS`` registry."""
+    return build(cfg, registry=HEADS)
+
+
+def build_loss(cfg):
+    """Build from ``cfg`` using the ``LOSSES`` registry."""
+    return build(cfg, registry=LOSSES)
+
+
+def build_model(cfg):
+    """Build from ``cfg`` using the ``MODELS`` registry."""
+    return build(cfg, registry=MODELS)
--- a/openselfsup/models/classification.py
+++ b/openselfsup/models/classification.py
@ -0,0 +1,79 @@
+import numpy as np
+
+import torch.nn as nn
+
+from openselfsup.utils import print_log
+
+from . import builder
+from .registry import MODELS
+from .utils import Sobel
+
+
+@MODELS.register_module
+class Classification(nn.Module):
+    """Backbone followed by a classification head.
+
+    Args:
+        backbone (dict): Config passed to ``builder.build_backbone``.
+        frozen_backbone (bool): If True, the backbone is switched to eval
+            mode and all of its parameters get ``requires_grad = False``
+            at construction time.
+        with_sobel (bool): If True, a ``Sobel`` layer is applied to the
+            input before the backbone.
+        head (dict | None): Config passed to ``builder.build_head``. When
+            None no head is created and only ``mode='extract'`` is usable.
+        pretrained (str | None): Forwarded to the backbone's
+            ``init_weights``.
+    """
+
+    def __init__(self,
+                 backbone,
+                 frozen_backbone=False,
+                 with_sobel=False,
+                 head=None,
+                 pretrained=None):
+        super(Classification, self).__init__()
+        self.with_sobel = with_sobel
+        if with_sobel:
+            self.sobel_layer = Sobel()
+        self.backbone = builder.build_backbone(backbone)
+        if frozen_backbone:
+            # NOTE(review): a later model.train() call puts the backbone
+            # back into train mode unless the backbone itself prevents it
+            # - confirm this is intended.
+            self.backbone.eval()
+            for param in self.backbone.parameters():
+                param.requires_grad = False
+        if head is not None:
+            self.head = builder.build_head(head)
+        self.init_weights(pretrained=pretrained)
+
+    def init_weights(self, pretrained=None):
+        """Initialise the backbone and, if one was built, the head.
+
+        Args:
+            pretrained (str | None): Forwarded to the backbone.
+        """
+        if pretrained is not None:
+            print_log('load model from: {}'.format(pretrained), logger='root')
+        self.backbone.init_weights(pretrained=pretrained)
+        # ``self.head`` only exists when a head config was supplied;
+        # touching it unconditionally raised AttributeError for head=None.
+        if hasattr(self, 'head'):
+            self.head.init_weights()
+
+    def forward_backbone(self, img):
+        """Forward backbone
+
+        Returns:
+            x (tuple): backbone outputs
+        """
+        if self.with_sobel:
+            img = self.sobel_layer(img)
+        x = self.backbone(img)
+        return x
+
+    def forward_train(self, img, gt_label, **kwargs):
+        """Compute the head's losses for a labelled batch.
+
+        Args:
+            img: Input batch.
+            gt_label: Labels handed to ``head.loss``.
+
+        Returns:
+            Whatever ``head.loss`` returns.
+        """
+        x = self.forward_backbone(img)
+        outs = self.head(x)
+        loss_inputs = (outs, gt_label)
+        losses = self.head.loss(*loss_inputs)
+        return losses
+
+    def forward_test(self, img, **kwargs):
+        """Return the head outputs, moved to CPU, keyed ``head0``, ``head1``...
+        """
+        x = self.forward_backbone(img)  # tuple
+        outs = self.head(x)
+        keys = ['head{}'.format(i) for i in range(len(outs))]
+        out_tensors = [out.cpu() for out in outs]  # NxC
+        return dict(zip(keys, out_tensors))
+
+    def aug_test(self, imgs):
+        """Test-time augmentation is not implemented.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        # ``NotImplemented`` is a comparison sentinel, not an exception;
+        # raising it produced a confusing TypeError instead.
+        raise NotImplementedError
+
+    def forward(self, img, mode='train', **kwargs):
+        """Dispatch to ``forward_train``, ``forward_test`` or the backbone.
+
+        Args:
+            img: Input batch.
+            mode (str): One of 'train', 'test' or 'extract'.
+
+        Raises:
+            Exception: If ``mode`` is not one of the supported values.
+        """
+        if mode == 'train':
+            return self.forward_train(img, **kwargs)
+        elif mode == 'test':
+            return self.forward_test(img, **kwargs)
+        elif mode == 'extract':
+            return self.forward_backbone(img)
+        else:
+            raise Exception("No such mode: {}".format(mode))
--- a/openselfsup/models/deepcluster.py
+++ b/openselfsup/models/deepcluster.py
@ -0,0 +1,88 @@
+import numpy as np
+
+import torch
+import torch.nn as nn
+
+from openselfsup.utils import print_log
+
+from . import builder
+from .registry import MODELS
+from .utils import Sobel
+
+
+@MODELS.register_module
+class DeepCluster(nn.Module):
+    """Backbone + neck + head trained on pseudo labels.
+
+    Args:
+        backbone (dict): Config passed to ``builder.build_backbone``.
+        with_sobel (bool): If True, a ``Sobel`` layer is applied first.
+        neck (dict): Config passed to ``builder.build_neck``.
+        head (dict): Config passed to ``builder.build_head``; its
+            ``num_classes`` attribute is read, so it presumably has to
+            support attribute access - verify against the config type.
+        pretrained (str | None): Forwarded to the backbone's
+            ``init_weights``.
+    """
+
+    def __init__(self,
+                 backbone,
+                 with_sobel=False,
+                 neck=None,
+                 head=None,
+                 pretrained=None):
+        super(DeepCluster, self).__init__()
+        self.with_sobel = with_sobel
+        if with_sobel:
+            self.sobel_layer = Sobel()
+        self.backbone = builder.build_backbone(backbone)
+        self.neck = builder.build_neck(neck)
+        if head is not None:
+            self.head = builder.build_head(head)
+        self.init_weights(pretrained=pretrained)
+
+        # Uniform per-class loss weights until set_reweight() is called.
+        self.num_classes = head.num_classes
+        uniform = torch.ones((self.num_classes, ), dtype=torch.float32)
+        self.loss_weight = uniform.cuda()
+        self.loss_weight /= self.loss_weight.sum()
+
+    def init_weights(self, pretrained=None):
+        """Initialise backbone, neck (kaiming) and head (normal)."""
+        if pretrained is not None:
+            print_log('load model from: {}'.format(pretrained), logger='root')
+        self.backbone.init_weights(pretrained=pretrained)
+        self.neck.init_weights(init_linear='kaiming')
+        self.head.init_weights(init_linear='normal')
+
+    def forward_backbone(self, img):
+        """Apply the optional Sobel layer, then the backbone.
+
+        Returns:
+            x (tuple): backbone outputs
+        """
+        inputs = self.sobel_layer(img) if self.with_sobel else img
+        return self.backbone(inputs)
+
+    def forward_train(self, img, pseudo_label, **kwargs):
+        """Compute the head's losses against the pseudo labels.
+
+        The backbone must produce exactly one output.
+        """
+        feats = self.forward_backbone(img)
+        assert len(feats) == 1
+        scores = self.head(self.neck(feats))
+        return self.head.loss(scores, pseudo_label)
+
+    def forward_test(self, img, **kwargs):
+        """Return head outputs on CPU, keyed ``head0``, ``head1``, ...
+
+        NOTE(review): unlike ``forward_train`` the neck is not applied
+        here - confirm this is intended.
+        """
+        feats = self.forward_backbone(img)  # tuple
+        scores = self.head(feats)
+        names = ['head{}'.format(idx) for idx in range(len(scores))]
+        cpu_scores = [score.cpu() for score in scores]  # NxC
+        return dict(zip(names, cpu_scores))
+
+    def forward(self, img, mode='train', **kwargs):
+        """Dispatch on ``mode``: 'train', 'test' or 'extract'."""
+        if mode == 'train':
+            return self.forward_train(img, **kwargs)
+        if mode == 'test':
+            return self.forward_test(img, **kwargs)
+        if mode == 'extract':
+            return self.forward_backbone(img)
+        raise Exception("No such mode: {}".format(mode))
+
+    def set_reweight(self, labels, reweight_pow=0.5):
+        """Recompute the per-class loss weights from a label histogram.
+
+        Each class gets a weight proportional to
+        ``(1 / (count + 1e-10)) ** reweight_pow``, normalised to sum to 1;
+        the head's criterion is replaced by a weighted cross entropy.
+
+        Args:
+            labels: Integer labels accepted by ``np.bincount``.
+            reweight_pow (float): Exponent applied to the inverse counts.
+        """
+        counts = np.bincount(labels, minlength=self.num_classes)
+        counts = counts.astype(np.float32)
+        inverse = (1. / (counts + 1e-10))**reweight_pow
+        normalized = inverse / inverse.sum()
+        self.loss_weight.copy_(torch.from_numpy(normalized))
+        self.head.criterion = nn.CrossEntropyLoss(weight=self.loss_weight)
--- a/openselfsup/models/heads/__init__.py
+++ b/openselfsup/models/heads/__init__.py
@ -0,0 +1,3 @@
+from .contrastive_head import ContrastiveHead
+from .cls_head import ClsHead
+from .multi_cls_head import MultiClsHead
--- a/openselfsup/models/heads/cls_head.py
+++ b/openselfsup/models/heads/cls_head.py
@ -0,0 +1,60 @@
+import torch.nn as nn
+from mmcv.cnn import kaiming_init, normal_init
+
+from ..utils import accuracy
+from ..registry import HEADS
+
+
+@HEADS.register_module
+class ClsHead(nn.Module):
+    """Classifier head made of a single fully-connected layer.
+
+    Args:
+        with_avg_pool (bool): If True, a global average pool is applied
+            to the (4-D) input before the linear layer.
+        in_channels (int): Input features of the linear layer.
+        num_classes (int): Output features of the linear layer.
+    """
+
+    def __init__(self,
+                 with_avg_pool=False,
+                 in_channels=2048,
+                 num_classes=1000):
+        super(ClsHead, self).__init__()
+        self.with_avg_pool = with_avg_pool
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+
+        self.criterion = nn.CrossEntropyLoss()
+
+        if self.with_avg_pool:
+            self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+        self.fc_cls = nn.Linear(in_channels, num_classes)
+
+    def init_weights(self, init_linear='normal'):
+        """Initialise linear layers and reset norm layers to (1, 0).
+
+        Args:
+            init_linear (str): 'normal' (std 0.01) or 'kaiming'.
+        """
+        assert init_linear in ['normal', 'kaiming'], \
+            "Undefined init_linear: {}".format(init_linear)
+        norm_types = (nn.BatchNorm2d, nn.GroupNorm, nn.SyncBatchNorm)
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                if init_linear == 'normal':
+                    normal_init(module, std=0.01)
+                else:
+                    kaiming_init(module, mode='fan_in', nonlinearity='relu')
+                continue
+            if not isinstance(module, norm_types):
+                continue
+            if module.weight is not None:
+                nn.init.constant_(module.weight, 1)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, 0)
+
+    def forward(self, x):
+        """Classify the single feature map contained in ``x``.
+
+        Args:
+            x (tuple | list): Exactly one tensor.
+
+        Returns:
+            list: One-element list holding the class scores.
+        """
+        assert isinstance(x, (tuple, list)) and len(x) == 1
+        feat = x[0]
+        if self.with_avg_pool:
+            assert feat.dim() == 4, \
+                "Tensor must has 4 dims, got: {}".format(feat.dim())
+            feat = self.avg_pool(feat)
+        flat = feat.view(feat.size(0), -1)
+        return [self.fc_cls(flat)]
+
+    def loss(self, cls_score, labels):
+        """Return a dict with the cross-entropy 'loss' and the 'acc'.
+
+        Args:
+            cls_score (tuple | list): Exactly one score tensor.
+            labels: Targets for the criterion and ``accuracy``.
+        """
+        assert isinstance(cls_score, (tuple, list)) and len(cls_score) == 1
+        score = cls_score[0]
+        losses = dict()
+        losses['loss'] = self.criterion(score, labels)
+        losses['acc'] = accuracy(score, labels)
+        return losses
--- a/openselfsup/models/heads/contrastive_head.py
+++ b/openselfsup/models/heads/contrastive_head.py
@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+from ..registry import HEADS
+
+
+@HEADS.register_module
+class ContrastiveHead(nn.Module):
+    '''Head for contrastive learning.
+
+    Args:
+        temperature (float): Divisor applied to the logits before the
+            cross-entropy loss.
+    '''
+
+    def __init__(self, temperature=0.1):
+        super(ContrastiveHead, self).__init__()
+        self.criterion = nn.CrossEntropyLoss()
+        self.temperature = temperature
+
+    def forward(self, pos, neg):
+        '''Cross entropy with the positive pair as the target class.
+
+        Args:
+            pos (Tensor): Nx1 positive similarity
+            neg (Tensor): Nxk negative similarity
+
+        Returns:
+            dict: ``{'loss': Tensor}``.
+        '''
+        N = pos.size(0)
+        logits = torch.cat((pos, neg), dim=1)
+        logits /= self.temperature
+        # The positive sits in column 0, so every target label is 0. The
+        # labels are created on the logits' device instead of calling
+        # .cuda(), which failed on CPU and on non-default GPUs.
+        labels = torch.zeros((N, ), dtype=torch.long, device=logits.device)
+        losses = dict()
+        losses['loss'] = self.criterion(logits, labels)
+        return losses
--- a/openselfsup/models/heads/multi_cls_head.py
+++ b/openselfsup/models/heads/multi_cls_head.py
@ -0,0 +1,77 @@
+import torch.nn as nn
+
+from ..utils import accuracy
+from ..registry import HEADS
+from ..utils import build_norm_layer, MultiPooling
+
+
+@HEADS.register_module
+class MultiClsHead(nn.Module):
+    """One linear classifier per selected backbone feature level.
+
+    Args:
+        pool_type (str): Passed to ``MultiPooling``.
+        in_indices (Sequence[int]): Feature levels to classify.
+        with_last_layer_unpool (bool): If True, one more classifier is
+            added that takes the last feature map without pooling.
+        backbone (str): Key into the per-backbone dimension tables.
+        norm_cfg (dict): Norm config; type 'null' disables the norm layers.
+        num_classes (int): Output size of every classifier.
+    """
+    FEAT_CHANNELS = {'resnet50': [64, 256, 512, 1024, 2048]}
+    FEAT_LAST_UNPOOL = {'resnet50': 2048 * 7 * 7}
+
+    def __init__(self,
+                 pool_type='adaptive',
+                 in_indices=(0, ),
+                 with_last_layer_unpool=False,
+                 backbone='resnet50',
+                 norm_cfg=dict(type='BN'),
+                 num_classes=1000):
+        super(MultiClsHead, self).__init__()
+        assert norm_cfg['type'] in ['BN', 'SyncBN', 'GN', 'null']
+
+        self.with_last_layer_unpool = with_last_layer_unpool
+        self.with_norm = norm_cfg['type'] != 'null'
+
+        self.criterion = nn.CrossEntropyLoss()
+
+        self.multi_pooling = MultiPooling(pool_type, in_indices, backbone)
+
+        if self.with_norm:
+            channels = self.FEAT_CHANNELS[backbone]
+            norm_layers = []
+            for level in in_indices:
+                norm_layers.append(
+                    build_norm_layer(norm_cfg, channels[level])[1])
+            self.norms = nn.ModuleList(norm_layers)
+
+        pooled_dims = self.multi_pooling.POOL_DIMS[backbone]
+        classifiers = []
+        for level in in_indices:
+            classifiers.append(nn.Linear(pooled_dims[level], num_classes))
+        self.fcs = nn.ModuleList(classifiers)
+        if with_last_layer_unpool:
+            self.fcs.append(
+                nn.Linear(self.FEAT_LAST_UNPOOL[backbone], num_classes))
+
+    def init_weights(self):
+        """Normal(0, 0.01) for linear weights, zero biases, norms to (1, 0).
+        """
+        norm_types = (nn.BatchNorm2d, nn.GroupNorm, nn.SyncBatchNorm)
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.normal_(module.weight, 0, 0.01)
+                nn.init.constant_(module.bias, 0)
+                continue
+            if not isinstance(module, norm_types):
+                continue
+            if module.weight is not None:
+                nn.init.constant_(module.weight, 1)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, 0)
+
+    def forward(self, x):
+        """Pool, optionally normalise, flatten and classify every level.
+
+        Args:
+            x (list | tuple): Backbone feature maps.
+
+        Returns:
+            list: One score tensor per classifier.
+        """
+        assert isinstance(x, (list, tuple))
+        if self.with_last_layer_unpool:
+            # Keep the raw last map before pooling replaces ``x``.
+            unpooled_last = x[-1]
+        feats = self.multi_pooling(x)
+        if self.with_norm:
+            feats = [
+                norm(feat) for norm, feat in zip(self.norms, feats)
+            ]
+        if self.with_last_layer_unpool:
+            feats.append(unpooled_last)
+        flattened = [feat.view(feat.size(0), -1) for feat in feats]
+        return [fc(feat) for fc, feat in zip(self.fcs, flattened)]
+
+    def loss(self, cls_score, labels):
+        """Return ``loss.k`` / ``acc.k`` entries (k starts at 1) per score.
+        """
+        losses = dict()
+        for position, score in enumerate(cls_score, start=1):
+            # keys must contain "loss"
+            losses['loss.{}'.format(position)] = self.criterion(score, labels)
+            losses['acc.{}'.format(position)] = accuracy(score, labels)
+        return losses
--- a/openselfsup/models/losses/__init__.py
+++ b/openselfsup/models/losses/__init__.py
@ -0,0 +1,19 @@
+#from .balanced_l1_loss import BalancedL1Loss, balanced_l1_loss
+#from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy,
+#                                 cross_entropy, mask_cross_entropy)
+#from .focal_loss import FocalLoss, sigmoid_focal_loss
+#from .ghm_loss import GHMC, GHMR
+#from .iou_loss import (BoundedIoULoss, GIoULoss, IoULoss, bounded_iou_loss,
+#                       iou_loss)
+#from .mse_loss import MSELoss, mse_loss
+#from .smooth_l1_loss import SmoothL1Loss, smooth_l1_loss
+#from .utils import reduce_loss, weight_reduce_loss, weighted_loss
+
+#__all__ = [
+#    'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy',
+#    'mask_cross_entropy', 'CrossEntropyLoss', 'sigmoid_focal_loss',
+#    'FocalLoss', 'smooth_l1_loss', 'SmoothL1Loss', 'balanced_l1_loss',
+#    'BalancedL1Loss', 'mse_loss', 'MSELoss', 'iou_loss', 'bounded_iou_loss',
+#    'IoULoss', 'BoundedIoULoss', 'GIoULoss', 'GHMC', 'GHMR', 'reduce_loss',
+#    'weight_reduce_loss', 'weighted_loss'
+#]
--- a/openselfsup/models/losses/cross_entropy_loss.py
+++ b/openselfsup/models/losses/cross_entropy_loss.py
@ -0,0 +1,103 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..registry import LOSSES
+from .utils import weight_reduce_loss
+
+
+def cross_entropy(pred, label, weight=None, reduction='mean', avg_factor=None):
+    """Softmax cross entropy with optional per-sample weights.
+
+    Args:
+        pred (Tensor): Predictions passed to ``F.cross_entropy``.
+        label (Tensor): Targets passed to ``F.cross_entropy``.
+        weight (Tensor | None): Weights applied to the unreduced loss
+            (cast to float).
+        reduction (str): Reduction forwarded to ``weight_reduce_loss``.
+        avg_factor (float | None): Forwarded to ``weight_reduce_loss``.
+
+    Returns:
+        Tensor: The weighted and reduced loss.
+    """
+    per_sample = F.cross_entropy(pred, label, reduction='none')
+    sample_weight = None if weight is None else weight.float()
+    return weight_reduce_loss(
+        per_sample,
+        weight=sample_weight,
+        reduction=reduction,
+        avg_factor=avg_factor)
+
+
+def _expand_binary_labels(labels, label_weights, label_channels):
+    """Turn 1-based class labels into one-hot rows (0 means no class).
+
+    Args:
+        labels (Tensor): Integer labels; a value ``c >= 1`` sets column
+            ``c - 1`` of its row to 1.
+        label_weights (Tensor | None): Per-sample weights, broadcast to
+            every column when given.
+        label_channels (int): Number of columns of the result.
+
+    Returns:
+        tuple: ``(bin_labels, bin_label_weights)``; the second item is
+        None when ``label_weights`` is None.
+    """
+    num_samples = labels.size(0)
+    bin_labels = labels.new_full((num_samples, label_channels), 0)
+    fg_inds = torch.nonzero(labels >= 1).squeeze()
+    if fg_inds.numel() > 0:
+        bin_labels[fg_inds, labels[fg_inds] - 1] = 1
+
+    if label_weights is None:
+        return bin_labels, None
+    column = label_weights.view(-1, 1)
+    bin_label_weights = column.expand(
+        label_weights.size(0), label_channels)
+    return bin_labels, bin_label_weights
+
+
+def binary_cross_entropy(pred,
+                         label,
+                         weight=None,
+                         reduction='mean',
+                         avg_factor=None):
+    """Sigmoid cross entropy with optional element-wise weights.
+
+    When ``pred`` and ``label`` differ in number of dimensions the labels
+    (and weights) are first expanded to one-hot form with
+    ``pred.size(-1)`` columns.
+
+    Args:
+        pred (Tensor): Logits.
+        label (Tensor): Class indices or binary targets.
+        weight (Tensor | None): Weights applied inside the BCE call.
+        reduction (str): Reduction forwarded to ``weight_reduce_loss``.
+        avg_factor (float | None): Forwarded to ``weight_reduce_loss``.
+
+    Returns:
+        Tensor: The reduced loss.
+    """
+    if pred.dim() != label.dim():
+        label, weight = _expand_binary_labels(label, weight, pred.size(-1))
+
+    elem_weight = None if weight is None else weight.float()
+    # The weights are already applied here, so the reduction step below
+    # receives no weight argument.
+    weighted = F.binary_cross_entropy_with_logits(
+        pred, label.float(), elem_weight, reduction='none')
+    return weight_reduce_loss(
+        weighted, reduction=reduction, avg_factor=avg_factor)
+
+
+def mask_cross_entropy(pred, target, label, reduction='mean', avg_factor=None):
+    """Mean sigmoid cross entropy on the prediction slice of each label.
+
+    Args:
+        pred (Tensor): Predictions indexed as ``pred[i, label[i]]``.
+        target (Tensor): Targets for the selected slices.
+        label (Tensor): Index into dimension 1 of ``pred`` per sample.
+        reduction (str): Reserved; must be 'mean'.
+        avg_factor (None): Reserved; must be None.
+
+    Returns:
+        Tensor: The mean loss as a tensor with one element.
+    """
+    # TODO: handle these two reserved arguments
+    assert reduction == 'mean' and avg_factor is None
+    roi_inds = torch.arange(
+        0, pred.size(0), dtype=torch.long, device=pred.device)
+    selected = pred[roi_inds, label].squeeze(1)
+    mean_loss = F.binary_cross_entropy_with_logits(
+        selected, target, reduction='mean')
+    return mean_loss[None]
+
+
+@LOSSES.register_module
+class CrossEntropyLoss(nn.Module):
+    """Configurable cross-entropy loss module.
+
+    Args:
+        use_sigmoid (bool): Use ``binary_cross_entropy``.
+        use_mask (bool): Use ``mask_cross_entropy``. Cannot be combined
+            with ``use_sigmoid``.
+        reduction (str): Default reduction.
+        loss_weight (float): Scalar multiplied onto the loss.
+    """
+
+    def __init__(self,
+                 use_sigmoid=False,
+                 use_mask=False,
+                 reduction='mean',
+                 loss_weight=1.0):
+        super(CrossEntropyLoss, self).__init__()
+        assert (use_sigmoid is False) or (use_mask is False)
+        self.use_sigmoid = use_sigmoid
+        self.use_mask = use_mask
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+        # Sigmoid takes precedence over mask; softmax is the fallback.
+        criterion = cross_entropy
+        if self.use_mask:
+            criterion = mask_cross_entropy
+        if self.use_sigmoid:
+            criterion = binary_cross_entropy
+        self.cls_criterion = criterion
+
+    def forward(self,
+                cls_score,
+                label,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None,
+                **kwargs):
+        """Compute the scaled loss with the selected criterion.
+
+        Args:
+            cls_score (Tensor): Predictions.
+            label (Tensor): Targets.
+            weight (Tensor | None): Passed positionally to the criterion.
+            avg_factor (float | None): Forwarded to the criterion.
+            reduction_override (str | None): Replaces the default
+                reduction when given; one of 'none', 'mean', 'sum'.
+            **kwargs: Forwarded to the criterion.
+
+        Returns:
+            Tensor: ``loss_weight`` times the criterion's result.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = reduction_override or self.reduction
+        raw_loss = self.cls_criterion(
+            cls_score,
+            label,
+            weight,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return self.loss_weight * raw_loss
--- a/openselfsup/models/losses/focal_loss.py
+++ b/openselfsup/models/losses/focal_loss.py
@ -0,0 +1,82 @@
+import torch.nn as nn
+import torch.nn.functional as F
+
+from openselfsup.ops import sigmoid_focal_loss as _sigmoid_focal_loss
+from ..registry import LOSSES
+from .utils import weight_reduce_loss
+
+
+# This method is only for debugging
+def py_sigmoid_focal_loss(pred,
+                          target,
+                          weight=None,
+                          gamma=2.0,
+                          alpha=0.25,
+                          reduction='mean',
+                          avg_factor=None):
+    """Sigmoid focal loss written with plain tensor operations.
+
+    Args:
+        pred (Tensor): Logits.
+        target (Tensor): Binary targets, cast to the type of ``pred``.
+        weight (Tensor | None): Forwarded to ``weight_reduce_loss``.
+        gamma (float): Exponent of the modulating factor.
+        alpha (float): Weight of the positive targets; negatives get
+            ``1 - alpha``.
+        reduction (str): Forwarded to ``weight_reduce_loss``.
+        avg_factor (float | None): Forwarded to ``weight_reduce_loss``.
+
+    Returns:
+        Tensor: The weighted and reduced loss.
+    """
+    prob = pred.sigmoid()
+    target = target.type_as(pred)
+    # pt is the probability mass assigned to the wrong side.
+    pt = (1 - prob) * target + prob * (1 - target)
+    alpha_factor = alpha * target + (1 - alpha) * (1 - target)
+    focal_weight = alpha_factor * pt.pow(gamma)
+    bce = F.binary_cross_entropy_with_logits(pred, target, reduction='none')
+    return weight_reduce_loss(
+        bce * focal_weight, weight, reduction, avg_factor)
+
+
+def sigmoid_focal_loss(pred,
+                       target,
+                       weight=None,
+                       gamma=2.0,
+                       alpha=0.25,
+                       reduction='mean',
+                       avg_factor=None):
+    """Sigmoid focal loss backed by the ``_sigmoid_focal_loss`` op.
+
+    Args:
+        pred (Tensor): Passed to the op.
+        target (Tensor): Passed to the op.
+        weight (Tensor | None): Reshaped to a column (``view(-1, 1)``)
+            before being applied.
+        gamma (float): Passed positionally to the op.
+        alpha (float): Passed positionally to the op.
+        reduction (str): Forwarded to ``weight_reduce_loss``.
+        avg_factor (float | None): Forwarded to ``weight_reduce_loss``.
+
+    Returns:
+        Tensor: The weighted and reduced loss.
+    """
+    # Function.apply does not accept keyword arguments, so the decorator
+    # "weighted_loss" is not applicable
+    elementwise = _sigmoid_focal_loss(pred, target, gamma, alpha)
+    # TODO: find a proper way to handle the shape of weight
+    column_weight = weight if weight is None else weight.view(-1, 1)
+    return weight_reduce_loss(
+        elementwise, column_weight, reduction, avg_factor)
+
+
+@LOSSES.register_module
+class FocalLoss(nn.Module):
+    """Focal loss module (sigmoid variant only).
+
+    Args:
+        use_sigmoid (bool): Must be True.
+        gamma (float): Forwarded to ``sigmoid_focal_loss``.
+        alpha (float): Forwarded to ``sigmoid_focal_loss``.
+        reduction (str): Default reduction.
+        loss_weight (float): Scalar multiplied onto the loss.
+    """
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 gamma=2.0,
+                 alpha=0.25,
+                 reduction='mean',
+                 loss_weight=1.0):
+        super(FocalLoss, self).__init__()
+        assert use_sigmoid is True, 'Only sigmoid focal loss supported now.'
+        self.use_sigmoid = use_sigmoid
+        self.gamma = gamma
+        self.alpha = alpha
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Compute the scaled focal loss.
+
+        Args:
+            pred (Tensor): Predictions.
+            target (Tensor): Targets.
+            weight (Tensor | None): Forwarded to ``sigmoid_focal_loss``.
+            avg_factor (float | None): Forwarded as well.
+            reduction_override (str | None): Replaces the default
+                reduction when given; one of 'none', 'mean', 'sum'.
+
+        Returns:
+            Tensor: ``loss_weight`` times the focal loss.
+
+        Raises:
+            NotImplementedError: If ``use_sigmoid`` is false.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = reduction_override or self.reduction
+        if not self.use_sigmoid:
+            raise NotImplementedError
+        focal = sigmoid_focal_loss(
+            pred,
+            target,
+            weight,
+            gamma=self.gamma,
+            alpha=self.alpha,
+            reduction=reduction,
+            avg_factor=avg_factor)
+        return self.loss_weight * focal
--- a/openselfsup/models/losses/ghm_loss.py
+++ b/openselfsup/models/losses/ghm_loss.py
@ -0,0 +1,171 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..registry import LOSSES
+
+
+def _expand_binary_labels(labels, label_weights, label_channels):
+    """Turn 1-based class labels into one-hot rows (0 means no class).
+
+    Args:
+        labels (Tensor): Integer labels; a value ``c >= 1`` sets column
+            ``c - 1`` of its row to 1.
+        label_weights (Tensor): Per-sample weights, broadcast to every
+            column. Unlike ``labels`` this may not be None here.
+        label_channels (int): Number of columns of the result.
+
+    Returns:
+        tuple: ``(bin_labels, bin_label_weights)``.
+    """
+    num_samples = labels.size(0)
+    bin_labels = labels.new_full((num_samples, label_channels), 0)
+    fg_inds = torch.nonzero(labels >= 1).squeeze()
+    if fg_inds.numel() > 0:
+        bin_labels[fg_inds, labels[fg_inds] - 1] = 1
+    column = label_weights.view(-1, 1)
+    bin_label_weights = column.expand(
+        label_weights.size(0), label_channels)
+    return bin_labels, bin_label_weights
+
+
+# TODO: code refactoring to make it consistent with other losses
+@LOSSES.register_module
+class GHMC(nn.Module):
+    """GHM Classification Loss.
+
+    Details of the theorem can be viewed in the paper
+    "Gradient Harmonized Single-stage Detector".
+    https://arxiv.org/abs/1811.05181
+
+    Samples are binned by gradient length and each sample is weighted
+    inversely to the population of its bin.
+
+    Args:
+        bins (int): Number of the unit regions for distribution calculation.
+        momentum (float): The parameter for moving average.
+        use_sigmoid (bool): Can only be true for BCE based loss now.
+        loss_weight (float): The weight of the total GHM-C loss.
+
+    Raises:
+        NotImplementedError: If ``use_sigmoid`` is false.
+    """
+
+    def __init__(self, bins=10, momentum=0, use_sigmoid=True, loss_weight=1.0):
+        super(GHMC, self).__init__()
+        self.bins = bins
+        self.momentum = momentum
+        # bins + 1 evenly spaced edges covering [0, 1]
+        edges = torch.arange(bins + 1).float() / bins
+        self.register_buffer('edges', edges)
+        # Bins are half-open [lo, hi); nudging the last edge lets a
+        # gradient length of exactly 1 fall into the last bin.
+        self.edges[-1] += 1e-6
+        if momentum > 0:
+            # running per-bin counts, only kept when momentum is enabled
+            acc_sum = torch.zeros(bins)
+            self.register_buffer('acc_sum', acc_sum)
+        self.use_sigmoid = use_sigmoid
+        if not self.use_sigmoid:
+            raise NotImplementedError
+        self.loss_weight = loss_weight
+
+    def forward(self, pred, target, label_weight, *args, **kwargs):
+        """Calculate the GHM-C loss.
+
+        Extra positional and keyword arguments are accepted and ignored.
+
+        Args:
+            pred (float tensor of size [batch_num, class_num]):
+                The direct prediction of classification fc layer.
+            target (float tensor of size [batch_num, class_num]):
+                Binary class target for each sample.
+            label_weight (float tensor of size [batch_num, class_num]):
+                the value is 1 if the sample is valid and 0 if ignored.
+        Returns:
+            The gradient harmonized loss.
+        """
+        # the target should be binary class label
+        if pred.dim() != target.dim():
+            target, label_weight = _expand_binary_labels(
+                target, label_weight, pred.size(-1))
+        target, label_weight = target.float(), label_weight.float()
+        edges = self.edges
+        mmt = self.momentum
+        weights = torch.zeros_like(pred)
+
+        # gradient length
+        g = torch.abs(pred.sigmoid().detach() - target)
+
+        valid = label_weight > 0
+        # number of valid elements, clamped to at least 1 to avoid
+        # dividing by zero below
+        tot = max(valid.float().sum().item(), 1.0)
+        n = 0  # n valid bins
+        for i in range(self.bins):
+            inds = (g >= edges[i]) & (g < edges[i + 1]) & valid
+            num_in_bin = inds.sum().item()
+            if num_in_bin > 0:
+                if mmt > 0:
+                    # moving average of the bin population; updates the
+                    # acc_sum buffer in place on every call
+                    self.acc_sum[i] = mmt * self.acc_sum[i] \
+                        + (1 - mmt) * num_in_bin
+                    weights[inds] = tot / self.acc_sum[i]
+                else:
+                    weights[inds] = tot / num_in_bin
+                n += 1
+        if n > 0:
+            # normalise by the number of non-empty bins
+            weights = weights / n
+
+        # elements outside every bin or not valid keep weight 0
+        loss = F.binary_cross_entropy_with_logits(
+            pred, target, weights, reduction='sum') / tot
+        return loss * self.loss_weight
+
+
+# TODO: code refactoring to make it consistent with other losses
+@LOSSES.register_module
+class GHMR(nn.Module):
+    """GHM Regression Loss.
+
+    Details of the theorem can be viewed in the paper
+    "Gradient Harmonized Single-stage Detector"
+    https://arxiv.org/abs/1811.05181
+
+    Args:
+        mu (float): The parameter for the Authentic Smooth L1 loss.
+        bins (int): Number of the unit regions for distribution calculation.
+        momentum (float): The parameter for moving average.
+        loss_weight (float): The weight of the total GHM-R loss.
+    """
+
+    def __init__(self, mu=0.02, bins=10, momentum=0, loss_weight=1.0):
+        super(GHMR, self).__init__()
+        self.mu = mu
+        self.bins = bins
+        # bins + 1 evenly spaced edges covering [0, 1]
+        edges = torch.arange(bins + 1).float() / bins
+        self.register_buffer('edges', edges)
+        # the last bin is open-ended in practice: its upper edge is 1000
+        self.edges[-1] = 1e3
+        self.momentum = momentum
+        if momentum > 0:
+            # running per-bin counts, only kept when momentum is enabled
+            acc_sum = torch.zeros(bins)
+            self.register_buffer('acc_sum', acc_sum)
+        self.loss_weight = loss_weight
+
+    # TODO: support reduction parameter
+    def forward(self, pred, target, label_weight, avg_factor=None):
+        """Calculate the GHM-R loss.
+
+        Args:
+            pred (float tensor of size [batch_num, 4 (* class_num)]):
+                The prediction of box regression layer. Channel number can be 4
+                or 4 * class_num depending on whether it is class-agnostic.
+            target (float tensor of size [batch_num, 4 (* class_num)]):
+                The target regression values with the same size of pred.
+            label_weight (float tensor of size [batch_num, 4 (* class_num)]):
+                The weight of each sample, 0 if ignored.
+            avg_factor: Accepted but not used by this implementation.
+        Returns:
+            The gradient harmonized loss.
+        """
+        mu = self.mu
+        edges = self.edges
+        mmt = self.momentum
+
+        # ASL1 loss
+        diff = pred - target
+        loss = torch.sqrt(diff * diff + mu * mu) - mu
+
+        # gradient length
+        g = torch.abs(diff / torch.sqrt(mu * mu + diff * diff)).detach()
+        weights = torch.zeros_like(g)
+
+        valid = label_weight > 0
+        # sum of the label weights (not a count of valid entries),
+        # clamped to at least 1 to avoid dividing by zero below
+        tot = max(label_weight.float().sum().item(), 1.0)
+        n = 0  # n: valid bins
+        for i in range(self.bins):
+            inds = (g >= edges[i]) & (g < edges[i + 1]) & valid
+            num_in_bin = inds.sum().item()
+            if num_in_bin > 0:
+                n += 1
+                if mmt > 0:
+                    # moving average of the bin population; updates the
+                    # acc_sum buffer in place on every call
+                    self.acc_sum[i] = mmt * self.acc_sum[i] \
+                        + (1 - mmt) * num_in_bin
+                    weights[inds] = tot / self.acc_sum[i]
+                else:
+                    weights[inds] = tot / num_in_bin
+        if n > 0:
+            # normalise by the number of non-empty bins
+            weights /= n
+
+        # entries that are not valid keep weight 0 and drop out of the sum
+        loss = loss * weights
+        loss = loss.sum() / tot
+        return loss * self.loss_weight
--- a/openselfsup/models/losses/utils.py
+++ b/openselfsup/models/losses/utils.py
@ -0,0 +1,98 @@
+import functools
+
+import torch.nn.functional as F
+
+
+def reduce_loss(loss, reduction):
+    """Reduce loss as specified.
+
+    Args:
+        loss (Tensor): Elementwise loss tensor.
+        reduction (str): Options are "none", "mean" and "sum".
+
+    Return:
+        Tensor: Reduced loss tensor.
+    """
+    reduction_enum = F._Reduction.get_enum(reduction)
+    # none: 0, elementwise_mean:1, sum: 2
+    if reduction_enum == 0:
+        return loss
+    elif reduction_enum == 1:
+        return loss.mean()
+    elif reduction_enum == 2:
+        return loss.sum()
+
+
+def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None):
+    """Apply element-wise weight and reduce loss.
+
+    Args:
+        loss (Tensor): Element-wise loss.
+        weight (Tensor): Element-wise weights.
+        reduction (str): Same as built-in losses of PyTorch.
+        avg_factor (float): Avarage factor when computing the mean of losses.
+
+    Returns:
+        Tensor: Processed loss values.
+    """
+    # if weight is specified, apply element-wise weight
+    if weight is not None:
+        loss = loss * weight
+
+    # if avg_factor is not specified, just reduce the loss
+    if avg_factor is None:
+        loss = reduce_loss(loss, reduction)
+    else:
+        # if reduction is mean, then average the loss by avg_factor
+        if reduction == 'mean':
+            loss = loss.sum() / avg_factor
+        # if reduction is 'none', then do nothing, otherwise raise an error
+        elif reduction != 'none':
+            raise ValueError('avg_factor can not be used with reduction="sum"')
+    return loss
+
+
+def weighted_loss(loss_func):
+    """Create a weighted version of a given loss function.
+
+    To use this decorator, the loss function must have the signature like
+    `loss_func(pred, target, **kwargs)`. The function only needs to compute
+    element-wise loss without any reduction. This decorator will add weight
+    and reduction arguments to the function. The decorated function will have
+    the signature like `loss_func(pred, target, weight=None, reduction='mean',
+    avg_factor=None, **kwargs)`.
+
+    :Example:
+
+    >>> import torch
+    >>> @weighted_loss
+    >>> def l1_loss(pred, target):
+    >>>     return (pred - target).abs()
+
+    >>> pred = torch.Tensor([0, 2, 3])
+    >>> target = torch.Tensor([1, 1, 1])
+    >>> weight = torch.Tensor([1, 0, 1])
+
+    >>> l1_loss(pred, target)
+    tensor(1.3333)
+    >>> l1_loss(pred, target, weight)
+    tensor(1.)
+    >>> l1_loss(pred, target, reduction='none')
+    tensor([1., 1., 2.])
+    >>> l1_loss(pred, target, weight, avg_factor=2)
+    tensor(1.5000)
+    """
+
+    @functools.wraps(loss_func)
+    def wrapper(pred,
+                target,
+                weight=None,
+                reduction='mean',
+                avg_factor=None,
+                **kwargs):
+        # get element-wise loss
+        loss = loss_func(pred, target, **kwargs)
+        loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+        return loss
+
+    return wrapper
--- a/openselfsup/models/memories/__init__.py
+++ b/openselfsup/models/memories/__init__.py
@ -0,0 +1,3 @@
+from .odc_memory import ODCMemory
+from .odc_memory_gpu import ODCMemoryGPU
+from .simple_memory import SimpleMemory
--- a/openselfsup/models/memories/odc_memory.py
+++ b/openselfsup/models/memories/odc_memory.py
@ -0,0 +1,217 @@
+import numpy as np
+from sklearn.cluster import KMeans
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+from mmcv.runner import get_dist_info
+
+from ..registry import MEMORIES
+
+
+@MEMORIES.register_module
+class ODCMemory(nn.Module):
+    '''Memory bank for Online Deep Clustering. Feature bank stored on CPU.
+
+    The feature bank is allocated on rank 0 only; the label bank (CPU) and
+    the centroids (CUDA) exist on every rank and are kept in sync through
+    ``torch.distributed`` collectives. Because of that, every rank must call
+    the public methods in the same order.
+    '''
+
+    def __init__(self, length, feat_dim, momentum, num_classes, min_cluster,
+                 **kwargs):
+        '''
+        Args:
+            length: Number of rows of the feature bank and label bank.
+            feat_dim: Dimension of each stored feature.
+            momentum: Weight given to the incoming feature in
+                ``update_samples_memory``.
+            num_classes: Number of clusters (rows of ``centroids``).
+            min_cluster: Clusters with fewer samples than this are treated
+                as small in ``deal_with_small_clusters``.
+            **kwargs: Only ``debug`` (bool, default False) is read.
+        '''
+        super(ODCMemory, self).__init__()
+        self.rank, self.num_replicas = get_dist_info()
+        # only rank 0 holds the features
+        if self.rank == 0:
+            self.feature_bank = torch.zeros((length, feat_dim),
+                                            dtype=torch.float32)
+        self.label_bank = torch.zeros((length, ), dtype=torch.long)
+        self.centroids = torch.zeros((num_classes, feat_dim),
+                                     dtype=torch.float32).cuda()
+        # 2-way k-means used to split the largest cluster
+        self.kmeans = KMeans(n_clusters=2, random_state=0, max_iter=20)
+        self.feat_dim = feat_dim
+        self.initialized = False
+        self.momentum = momentum
+        self.num_classes = num_classes
+        self.min_cluster = min_cluster
+        self.debug = kwargs.get('debug', False)
+
+    def init_memory(self, feature, label):
+        '''Fill the banks from numpy arrays and compute initial centroids.
+
+        ``feature`` is L2-normalized in place on rank 0. Every cluster id in
+        ``[0, num_classes)`` must appear in ``label``.
+        '''
+        self.initialized = True
+        self.label_bank.copy_(torch.from_numpy(label).long())
+        # make sure no empty clusters
+        assert (np.bincount(label, minlength=self.num_classes) != 0).all()
+        if self.rank == 0:
+            feature /= (np.linalg.norm(feature, axis=1).reshape(-1, 1) + 1e-10)
+            self.feature_bank.copy_(torch.from_numpy(feature))
+            centroids = self._compute_centroids()
+            self.centroids.copy_(centroids)
+        # share rank 0's centroids with all ranks
+        dist.broadcast(self.centroids, 0)
+
+    def _compute_centroids_ind(self, cinds):
+        '''compute a few centroids
+
+        Returns a CPU tensor with one row per entry of ``cinds``, in the same
+        order. Rank 0 only.
+        '''
+        assert self.rank == 0
+        num = len(cinds)
+        centroids = torch.zeros((num, self.feat_dim), dtype=torch.float32)
+        for i, c in enumerate(cinds):
+            ind = np.where(self.label_bank.numpy() == c)[0]
+            centroids[i, :] = self.feature_bank[ind, :].mean(dim=0)
+        return centroids
+
+    def _compute_centroids(self):
+        '''compute all non-empty centroids
+
+        Returns a CPU copy of ``self.centroids`` in which the row of every
+        class present in the label bank is replaced by the mean of its
+        features. Rank 0 only.
+        '''
+        assert self.rank == 0
+        l = self.label_bank.numpy()
+        # sort the labels so that each class forms one contiguous run
+        argl = np.argsort(l)
+        sortl = l[argl]
+        # positions where the sorted label changes mark the run boundaries
+        diff_pos = np.where(sortl[1:] - sortl[:-1] != 0)[0] + 1
+        start = np.insert(diff_pos, 0, 0)
+        end = np.insert(diff_pos, len(diff_pos), len(l))
+        class_start = sortl[start]
+        # keep empty class centroids unchanged
+        centroids = self.centroids.cpu().clone()
+        for i, st, ed in zip(class_start, start, end):
+            centroids[i, :] = self.feature_bank[argl[st:ed], :].mean(dim=0)
+        return centroids
+
+    def _gather(self, ind, feature):  # gather ind and feature
+        '''All-gather ``ind`` and ``feature`` and concatenate along dim 0.'''
+        #if not hasattr(self, 'ind_gathered'):
+        #    self.ind_gathered = [torch.ones_like(ind).cuda()
+        #                         for _ in range(self.num_replicas)]
+        #if not hasattr(self, 'feature_gathered'):
+        #    self.feature_gathered = [torch.ones_like(feature).cuda()
+        #                             for _ in range(self.num_replicas)]
+        ind_gathered = [
+            torch.ones_like(ind).cuda() for _ in range(self.num_replicas)
+        ]
+        feature_gathered = [
+            torch.ones_like(feature).cuda() for _ in range(self.num_replicas)
+        ]
+        dist.all_gather(ind_gathered, ind)
+        dist.all_gather(feature_gathered, feature)
+        ind_gathered = torch.cat(ind_gathered, dim=0)
+        feature_gathered = torch.cat(feature_gathered, dim=0)
+        return ind_gathered, feature_gathered
+
+    def update_samples_memory(self, ind, feature):  # ind, feature: cuda tensor
+        '''Momentum-update stored features and re-assign their labels.
+
+        Returns the fraction of gathered samples whose label changed, as a
+        CUDA tensor.
+        '''
+        assert self.initialized
+        feature_norm = feature / (feature.norm(dim=1).view(-1, 1) + 1e-10
+                                  )  # normalize
+        ind, feature_norm = self._gather(
+            ind, feature_norm)  # ind: (N*w), feature: (N*w)xk, cuda tensor
+        ind = ind.cpu()
+        if self.rank == 0:
+            # blend old and new features, then re-normalize
+            feature_old = self.feature_bank[ind, ...].cuda()
+            feature_new = (1 - self.momentum) * feature_old + \
+                self.momentum * feature_norm
+            feature_norm = feature_new / (
+                feature_new.norm(dim=1).view(-1, 1) + 1e-10)
+            self.feature_bank[ind, ...] = feature_norm.cpu()
+        dist.barrier()
+        # other ranks receive rank 0's updated features
+        dist.broadcast(feature_norm, 0)
+        # compute new labels
+        similarity_to_centroids = torch.mm(self.centroids,
+                                           feature_norm.permute(1, 0))  # CxN
+        newlabel = similarity_to_centroids.argmax(dim=0)  # cuda tensor
+        newlabel_cpu = newlabel.cpu()
+        change_ratio = (newlabel_cpu !=
+            self.label_bank[ind]).sum().float().cuda() \
+            / float(newlabel_cpu.shape[0])
+        self.label_bank[ind] = newlabel_cpu.clone()  # copy to cpu
+        return change_ratio
+
+    def deal_with_small_clusters(self):
+        '''Empty every cluster smaller than ``min_cluster`` and refill it.
+
+        Samples of small clusters are moved to the most similar remaining
+        centroid; the emptied clusters are then refilled by
+        ``_redirect_empty_clusters``.
+        '''
+        # check empty class
+        hist = np.bincount(self.label_bank.numpy(), minlength=self.num_classes)
+        small_clusters = np.where(hist < self.min_cluster)[0].tolist()
+        if self.debug and self.rank == 0:
+            print("mincluster: {}, num of small class: {}".format(
+                hist.min(), len(small_clusters)))
+        if len(small_clusters) == 0:
+            return
+        # re-assign samples in small clusters to make them empty
+        for s in small_clusters:
+            ind = np.where(self.label_bank.numpy() == s)[0]
+            if len(ind) > 0:
+                # ids of the clusters that are not small
+                inclusion = torch.from_numpy(
+                    np.setdiff1d(
+                        np.arange(self.num_classes),
+                        np.array(small_clusters),
+                        assume_unique=True)).cuda()
+                if self.rank == 0:
+                    target_ind = torch.mm(
+                        self.centroids[inclusion, :],
+                        self.feature_bank[ind, :].cuda().permute(
+                            1, 0)).argmax(dim=0)
+                    target = inclusion[target_ind]
+                else:
+                    target = torch.zeros((ind.shape[0], ),
+                                         dtype=torch.int64).cuda()
+                # other ranks contribute zeros, so the reduction yields
+                # rank 0's targets everywhere
+                dist.all_reduce(target)
+                self.label_bank[ind] = torch.from_numpy(target.cpu().numpy())
+        # deal with empty cluster
+        self._redirect_empty_clusters(small_clusters)
+
+    def update_centroids_memory(self, cinds=None):
+        '''Recompute centroids on rank 0 and broadcast them.
+
+        With ``cinds=None`` all non-empty centroids are recomputed; otherwise
+        only the listed cluster ids.
+        '''
+        if self.rank == 0:
+            if self.debug:
+                print("updating centroids ...")
+            if cinds is None:
+                center = self._compute_centroids()
+                self.centroids.copy_(center)
+            else:
+                center = self._compute_centroids_ind(cinds)
+                self.centroids[
+                    torch.LongTensor(cinds).cuda(), :] = center.cuda()
+        dist.broadcast(self.centroids, 0)
+
+    def _partition_max_cluster(self, max_cluster):
+        '''Split cluster ``max_cluster`` into two index arrays. Rank 0 only.
+
+        Uses 2-way k-means on the cluster's features and falls back to a
+        random half split when k-means leaves one side empty.
+        '''
+        assert self.rank == 0
+        max_cluster_inds = np.where(self.label_bank == max_cluster)[0]
+
+        assert len(max_cluster_inds) >= 2
+        max_cluster_features = self.feature_bank[max_cluster_inds, :]
+        if np.any(np.isnan(max_cluster_features.numpy())):
+            raise Exception("Has nan in features.")
+        kmeans_ret = self.kmeans.fit(max_cluster_features)
+        sub_cluster1_ind = max_cluster_inds[kmeans_ret.labels_ == 0]
+        sub_cluster2_ind = max_cluster_inds[kmeans_ret.labels_ == 1]
+        if not (len(sub_cluster1_ind) > 0 and len(sub_cluster2_ind) > 0):
+            print(
+                "Warning: kmeans partition fails, resort to random partition.")
+            sub_cluster1_ind = np.random.choice(
+                max_cluster_inds, len(max_cluster_inds) // 2, replace=False)
+            sub_cluster2_ind = np.setdiff1d(
+                max_cluster_inds, sub_cluster1_ind, assume_unique=True)
+        return sub_cluster1_ind, sub_cluster2_ind
+
+    def _redirect_empty_clusters(self, empty_clusters):
+        '''Refill each empty cluster with half of the current largest one.
+
+        Rank 0 computes the split; the other ranks learn the sizes through
+        ``all_reduce`` and the sample indices through ``broadcast``.
+        '''
+        for e in empty_clusters:
+            assert (self.label_bank != e).all().item(), \
+                "Cluster #{} is not an empty cluster.".format(e)
+            max_cluster = np.bincount(
+                self.label_bank, minlength=self.num_classes).argmax().item()
+            # gather partitioning indices
+            if self.rank == 0:
+                sub_cluster1_ind, sub_cluster2_ind = self._partition_max_cluster(
+                    max_cluster)
+                size1 = torch.LongTensor([len(sub_cluster1_ind)]).cuda()
+                size2 = torch.LongTensor([len(sub_cluster2_ind)]).cuda()
+                sub_cluster1_ind_tensor = torch.from_numpy(
+                    sub_cluster1_ind).long().cuda()
+                sub_cluster2_ind_tensor = torch.from_numpy(
+                    sub_cluster2_ind).long().cuda()
+            else:
+                size1 = torch.LongTensor([0]).cuda()
+                size2 = torch.LongTensor([0]).cuda()
+            # non-zero ranks contribute 0, so every rank ends with rank 0's
+            # sizes
+            dist.all_reduce(size1)
+            dist.all_reduce(size2)
+            if self.rank != 0:
+                # receive buffers sized to match rank 0's index tensors
+                sub_cluster1_ind_tensor = torch.zeros(
+                    (size1, ), dtype=torch.int64).cuda()
+                sub_cluster2_ind_tensor = torch.zeros(
+                    (size2, ), dtype=torch.int64).cuda()
+            dist.broadcast(sub_cluster1_ind_tensor, 0)
+            dist.broadcast(sub_cluster2_ind_tensor, 0)
+            if self.rank != 0:
+                sub_cluster1_ind = sub_cluster1_ind_tensor.cpu().numpy()
+                sub_cluster2_ind = sub_cluster2_ind_tensor.cpu().numpy()
+
+            # reassign samples in partition #2 to the empty class
+            self.label_bank[sub_cluster2_ind] = e
+            # update centroids of max_cluster and e
+            self.update_centroids_memory([max_cluster, e])
--- a/openselfsup/models/memories/odc_memory_gpu.py
+++ b/openselfsup/models/memories/odc_memory_gpu.py
@ -0,0 +1,190 @@
+import numpy as np
+from sklearn.cluster import KMeans
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+from mmcv.runner import get_dist_info
+
+from ..registry import MEMORIES
+
+
+@MEMORIES.register_module
+class ODCMemoryGPU(nn.Module):
+    '''Memory bank for Online Deep Clustering. Feature bank stored in GPU.
+    '''
+
+    def __init__(self, length, feat_dim, momentum, num_classes, min_cluster,
+                 **kwargs):
+        super(ODCMemoryGPU, self).__init__()
+        self.rank, self.num_replicas = get_dist_info()
+        self.feature_bank = torch.zeros((length, feat_dim),
+                                        dtype=torch.float32).cuda()
+        self.label_bank = torch.zeros((length, ), dtype=torch.long).cuda()
+        self.centroids = torch.zeros((num_classes, feat_dim),
+                                     dtype=torch.float32).cuda()
+        self.kmeans = KMeans(n_clusters=2, random_state=0, max_iter=20)
+        self.feat_dim = feat_dim
+        self.initialized = False
+        self.momentum = momentum
+        self.num_classes = num_classes
+        self.min_cluster = min_cluster
+        self.debug = kwargs.get('debug', False)
+
+    @torch.no_grad()
+    def init_memory(self, feature, label):
+        self.initialized = True
+        self.label_bank.copy_(torch.from_numpy(label).long().cuda())
+        # make sure no empty clusters
+        assert (np.bincount(label, minlength=self.num_classes) != 0).all()
+        feature /= (np.linalg.norm(feature, axis=1).reshape(-1, 1) + 1e-10)
+        self.feature_bank.copy_(torch.from_numpy(feature))
+        self._compute_centroids()
+
+    @torch.no_grad()
+    def _compute_centroids_ind(self, cinds):
+        '''compute a few centroids'''
+        for i, c in enumerate(cinds):
+            ind = torch.where(self.label_bank == c)[0]
+            self.centroids[i, :] = self.feature_bank[ind, :].mean(dim=0)
+
+    def _compute_centroids(self):
+        if self.debug:
+            print("enter: _compute_centroids")
+        '''compute all non-empty centroids'''
+        l = self.label_bank.cpu().numpy()
+        argl = np.argsort(l)
+        sortl = l[argl]
+        diff_pos = np.where(sortl[1:] - sortl[:-1] != 0)[0] + 1
+        start = np.insert(diff_pos, 0, 0)
+        end = np.insert(diff_pos, len(diff_pos), len(l))
+        class_start = sortl[start]
+        # keep empty class centroids unchanged
+        for i, st, ed in zip(class_start, start, end):
+            self.centroids[i, :] = self.feature_bank[argl[st:ed], :].mean(
+                dim=0)
+
+    def _gather(self, ind, feature):  # gather ind and feature
+        if self.debug:
+            print("enter: _gather")
+        assert ind.size(0) > 0
+        ind_gathered = [
+            torch.ones_like(ind).cuda() for _ in range(self.num_replicas)
+        ]
+        feature_gathered = [
+            torch.ones_like(feature).cuda() for _ in range(self.num_replicas)
+        ]
+        dist.all_gather(ind_gathered, ind)
+        dist.all_gather(feature_gathered, feature)
+        ind_gathered = torch.cat(ind_gathered, dim=0)
+        feature_gathered = torch.cat(feature_gathered, dim=0)
+        return ind_gathered, feature_gathered
+
+    def update_samples_memory(self, ind, feature):  # ind, feature: cuda tensor
+        if self.debug:
+            print("enter: update_samples_memory")
+        assert self.initialized
+        feature_norm = feature / (feature.norm(dim=1).view(-1, 1) + 1e-10
+                                  )  # normalize
+        ind, feature_norm = self._gather(
+            ind, feature_norm)  # ind: (N*w), feature: (N*w)xk, cuda tensor
+        # momentum update
+        feature_old = self.feature_bank[ind, ...]
+        feature_new = (1 - self.momentum) * feature_old + \
+            self.momentum * feature_norm
+        feature_norm = feature_new / (
+            feature_new.norm(dim=1).view(-1, 1) + 1e-10)
+        self.feature_bank[ind, ...] = feature_norm
+        # compute new labels
+        similarity_to_centroids = torch.mm(self.centroids,
+                                           feature_norm.permute(1, 0))  # CxN
+        newlabel = similarity_to_centroids.argmax(dim=0)  # cuda tensor
+        change_ratio = (newlabel !=
+            self.label_bank[ind]).sum().float() \
+            / float(newlabel.shape[0])
+        self.label_bank[ind] = newlabel.clone()  # copy to cpu
+        return change_ratio
+
+    @torch.no_grad()
+    def deal_with_small_clusters(self):
+        if self.debug:
+            print("enter: deal_with_small_clusters")
+        # check empty class
+        hist = torch.bincount(self.label_bank, minlength=self.num_classes)
+        small_clusters = torch.where(hist < self.min_cluster)[0]
+        if self.debug and self.rank == 0:
+            print("mincluster: {}, num of small class: {}".format(
+                hist.min(), len(small_clusters)))
+        if len(small_clusters) == 0:
+            return
+        # re-assign samples in small clusters to make them empty
+        for s in small_clusters:
+            ind = torch.where(self.label_bank == s)[0]
+            if len(ind) > 0:
+                inclusion = torch.from_numpy(
+                    np.setdiff1d(
+                        np.arange(self.num_classes),
+                        small_clusters.cpu().numpy(),
+                        assume_unique=True)).cuda()
+                target_ind = torch.mm(self.centroids[inclusion, :],
+                                      self.feature_bank[ind, :].permute(
+                                          1, 0)).argmax(dim=0)
+                target = inclusion[target_ind]
+                self.label_bank[ind] = target
+        # deal with empty cluster
+        self._redirect_empty_clusters(small_clusters)
+
+    def update_centroids_memory(self, cinds=None):
+        if cinds is None:
+            self._compute_centroids()
+        else:
+            self._compute_centroids_ind(cinds)
+
+    def _partition_max_cluster(self, max_cluster):
+        if self.debug:
+            print("enter: _partition_max_cluster")
+        assert self.rank == 0  # avoid randomness among ranks
+        max_cluster_inds = torch.where(self.label_bank == max_cluster)[0]
+        size = len(max_cluster_inds)
+
+        assert size >= 2  # image indices in the max cluster
+        max_cluster_features = self.feature_bank[max_cluster_inds, :]
+        if torch.any(torch.isnan(max_cluster_features)):
+            raise Exception("Has nan in features.")
+        kmeans_ret = self.kmeans.fit(max_cluster_features.cpu().numpy())
+        kmeans_labels = torch.from_numpy(kmeans_ret.labels_).cuda()
+        sub_cluster1_ind = max_cluster_inds[kmeans_labels == 0]
+        sub_cluster2_ind = max_cluster_inds[kmeans_labels == 1]
+        if not (len(sub_cluster1_ind) > 0 and len(sub_cluster2_ind) > 0):
+            print(
+                "Warning: kmeans partition fails, resort to random partition.")
+            rnd_idx = torch.randperm(size)
+            sub_cluster1_ind = max_cluster_inds[rnd_idx[:size // 2]]
+            sub_cluster2_ind = max_cluster_inds[rnd_idx[size // 2:]]
+        return sub_cluster1_ind, sub_cluster2_ind
+
+    def _redirect_empty_clusters(self, empty_clusters):
+        if self.debug:
+            print("enter: _redirect_empty_clusters")
+        for e in empty_clusters:
+            assert (self.label_bank != e).all().item(), \
+                "Cluster #{} is not an empty cluster.".format(e)
+            max_cluster = torch.bincount(
+                self.label_bank, minlength=self.num_classes).argmax().item()
+            # gather partitioning indices
+            if self.rank == 0:
+                sub_cluster1_ind, sub_cluster2_ind = self._partition_max_cluster(
+                    max_cluster)
+                size2 = torch.LongTensor([len(sub_cluster2_ind)]).cuda()
+            else:
+                size2 = torch.LongTensor([0]).cuda()
+            dist.all_reduce(size2)
+            if self.rank != 0:
+                sub_cluster2_ind = torch.zeros((size2, ),
+                                               dtype=torch.int64).cuda()
+            dist.broadcast(sub_cluster2_ind, 0)
+
+            # reassign samples in partition #2 to the empty class
+            self.label_bank[sub_cluster2_ind] = e
+            # update centroids of max_cluster and e
+            self.update_centroids_memory([max_cluster, e])
--- a/openselfsup/models/memories/simple_memory.py
+++ b/openselfsup/models/memories/simple_memory.py
@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+from mmcv.runner import get_dist_info
+from openselfsup.utils import AliasMethod
+
+from ..registry import MEMORIES
+
+
+@MEMORIES.register_module
+class SimpleMemory(nn.Module):
+
+    def __init__(self, length, feat_dim, momentum, **kwargs):
+        super(SimpleMemory, self).__init__()
+        self.rank, self.num_replicas = get_dist_info()
+        self.feature_bank = torch.randn(length, feat_dim).cuda()
+        self.feature_bank = nn.functional.normalize(self.feature_bank)
+        self.momentum = momentum
+        self.multinomial = AliasMethod(torch.ones(length))
+        self.multinomial.cuda()
+
+    def update(self, ind, feature):
+        feature_norm = nn.functional.normalize(feature)
+        ind, feature_norm = self._gather(ind, feature_norm)
+        feature_old = self.feature_bank[ind, ...]
+        feature_new = (1 - self.momentum) * feature_old + \
+            self.momentum * feature_norm
+        feature_new_norm = nn.functional.normalize(feature_new)
+        self.feature_bank[ind, ...] = feature_new_norm
+
+    def _gather(self, ind, feature):  # gather ind and feature
+        ind_gathered = [
+            torch.ones_like(ind).cuda() for _ in range(self.num_replicas)
+        ]
+        feature_gathered = [
+            torch.ones_like(feature).cuda() for _ in range(self.num_replicas)
+        ]
+        dist.all_gather(ind_gathered, ind)
+        dist.all_gather(feature_gathered, feature)
+        ind_gathered = torch.cat(ind_gathered, dim=0)
+        feature_gathered = torch.cat(feature_gathered, dim=0)
+        return ind_gathered, feature_gathered
--- a/openselfsup/models/moco.py
+++ b/openselfsup/models/moco.py
@ -0,0 +1,189 @@
+import torch
+import torch.nn as nn
+
+from openselfsup.utils import print_log
+
+from . import builder
+from .registry import MODELS
+
+
+@MODELS.register_module
+class MOCO(nn.Module):
+    '''MOCO.
+
+    Holds a trainable query encoder, a frozen key encoder updated by
+    momentum, and a queue of past keys that serve as negatives.
+
+    Part of the code is borrowed from:
+        "https://github.com/facebookresearch/moco/blob/master/moco/builder.py".
+    '''
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 head=None,
+                 pretrained=None,
+                 queue_len=65536,
+                 feat_dim=128,
+                 momentum=0.999,
+                 **kwargs):
+        '''
+        Args:
+            backbone: Config passed to ``builder.build_backbone``.
+            neck: Config passed to ``builder.build_neck``.
+            head: Config passed to ``builder.build_head``.
+            pretrained: Forwarded to the backbone's ``init_weights``.
+            queue_len: Number of keys kept in the queue.
+            feat_dim: Dimension of each key stored in the queue.
+            momentum: Fraction of the key encoder's own weights kept at each
+                momentum update.
+        '''
+        super(MOCO, self).__init__()
+        # both encoders are built from the same configs: backbone, then neck
+        self.encoder_q = nn.Sequential(
+            builder.build_backbone(backbone), builder.build_neck(neck))
+        self.encoder_k = nn.Sequential(
+            builder.build_backbone(backbone), builder.build_neck(neck))
+        self.backbone = self.encoder_q[0]
+        # the key encoder receives no gradients
+        for param in self.encoder_k.parameters():
+            param.requires_grad = False
+        self.head = builder.build_head(head)
+        self.init_weights(pretrained=pretrained)
+
+        self.queue_len = queue_len
+        self.momentum = momentum
+
+        # create the queue
+        self.register_buffer("queue", torch.randn(feat_dim, queue_len))
+        # each column (one key) is normalized to unit length
+        self.queue = nn.functional.normalize(self.queue, dim=0)
+        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))
+
+    def init_weights(self, pretrained=None):
+        '''Initialize the query encoder and copy its weights to the key
+        encoder.'''
+        if pretrained is not None:
+            print_log('load model from: {}'.format(pretrained), logger='root')
+        self.encoder_q[0].init_weights(pretrained=pretrained)
+        self.encoder_q[1].init_weights(init_linear='kaiming')
+        for param_q, param_k in zip(self.encoder_q.parameters(),
+                                    self.encoder_k.parameters()):
+            param_k.data.copy_(param_q.data)
+
+    @torch.no_grad()
+    def _momentum_update_key_encoder(self):
+        """
+        Momentum update of the key encoder:
+        param_k = momentum * param_k + (1 - momentum) * param_q
+        """
+        for param_q, param_k in zip(self.encoder_q.parameters(),
+                                    self.encoder_k.parameters()):
+            param_k.data = param_k.data * self.momentum + \
+                           param_q.data * (1. - self.momentum)
+
+    @torch.no_grad()
+    def _dequeue_and_enqueue(self, keys):
+        """
+        Overwrite the queue slots at the current pointer with the keys
+        gathered from all ranks, then advance the pointer (wrapping around).
+        The gathered batch size must divide ``queue_len``.
+        """
+        # gather keys before updating queue
+        keys = concat_all_gather(keys)
+
+        batch_size = keys.shape[0]
+
+        ptr = int(self.queue_ptr)
+        assert self.queue_len % batch_size == 0  # for simplicity
+
+        # replace the keys at ptr (dequeue and enqueue)
+        self.queue[:, ptr:ptr + batch_size] = keys.transpose(0, 1)
+        ptr = (ptr + batch_size) % self.queue_len  # move pointer
+
+        self.queue_ptr[0] = ptr
+
+    @torch.no_grad()
+    def _batch_shuffle_ddp(self, x):
+        """
+        Batch shuffle, for making use of BatchNorm.
+        *** Only support DistributedDataParallel (DDP) model. ***
+
+        Returns this rank's share of the shuffled global batch and the
+        index needed to undo the shuffle.
+        """
+        # gather from all gpus
+        batch_size_this = x.shape[0]
+        x_gather = concat_all_gather(x)
+        batch_size_all = x_gather.shape[0]
+
+        num_gpus = batch_size_all // batch_size_this
+
+        # random shuffle index
+        idx_shuffle = torch.randperm(batch_size_all).cuda()
+
+        # broadcast to all gpus
+        torch.distributed.broadcast(idx_shuffle, src=0)
+
+        # index for restoring
+        idx_unshuffle = torch.argsort(idx_shuffle)
+
+        # shuffled index for this gpu
+        gpu_idx = torch.distributed.get_rank()
+        idx_this = idx_shuffle.view(num_gpus, -1)[gpu_idx]
+
+        return x_gather[idx_this], idx_unshuffle
+
+    @torch.no_grad()
+    def _batch_unshuffle_ddp(self, x, idx_unshuffle):
+        """
+        Undo batch shuffle.
+        *** Only support DistributedDataParallel (DDP) model. ***
+
+        Returns this rank's share of the global batch in its original order.
+        """
+        # gather from all gpus
+        batch_size_this = x.shape[0]
+        x_gather = concat_all_gather(x)
+        batch_size_all = x_gather.shape[0]
+
+        num_gpus = batch_size_all // batch_size_this
+
+        # restored index for this gpu
+        gpu_idx = torch.distributed.get_rank()
+        idx_this = idx_unshuffle.view(num_gpus, -1)[gpu_idx]
+
+        return x_gather[idx_this]
+
+    def forward_train(self, img, **kwargs):
+        '''Compute the training losses for a batch of view pairs.
+
+        Args:
+            img: 5-dim tensor; index 0 along dim 1 is the query view and
+                index 1 the key view.
+                NOTE(review): presumably (N, 2, C, H, W) - confirm against
+                the data pipeline.
+
+        Returns:
+            Whatever ``self.head(l_pos, l_neg)`` returns.
+        '''
+        assert img.dim() == 5, \
+            "Input must have 5 dims, got: {}".format(img.dim())
+        im_q = img[:, 0, ...].contiguous()
+        im_k = img[:, 1, ...].contiguous()
+        # compute query features
+        q = self.encoder_q(im_q)[0]  # queries: NxC
+        q = nn.functional.normalize(q, dim=1)
+
+        # compute key features
+        with torch.no_grad():  # no gradient to keys
+            self._momentum_update_key_encoder()  # update the key encoder
+
+            # shuffle for making use of BN
+            im_k, idx_unshuffle = self._batch_shuffle_ddp(im_k)
+
+            k = self.encoder_k(im_k)[0]  # keys: NxC
+            k = nn.functional.normalize(k, dim=1)
+
+            # undo shuffle
+            k = self._batch_unshuffle_ddp(k, idx_unshuffle)
+
+        # compute logits
+        # Einstein sum is more intuitive
+        # positive logits: Nx1
+        l_pos = torch.einsum('nc,nc->n', [q, k]).unsqueeze(-1)
+        # negative logits: NxK
+        l_neg = torch.einsum('nc,ck->nk', [q, self.queue.clone().detach()])
+
+        losses = self.head(l_pos, l_neg)
+        # the queue is updated only after the negatives have been read
+        self._dequeue_and_enqueue(k)
+
+        return losses
+
+    def forward_test(self, img, **kwargs):
+        '''Not implemented; returns None.'''
+        pass
+
+    def forward(self, img, mode='train', **kwargs):
+        '''Dispatch on ``mode``: 'train', 'test', or 'extract' (query
+        backbone output only). Any other mode raises an Exception.'''
+        if mode == 'train':
+            return self.forward_train(img, **kwargs)
+        elif mode == 'test':
+            return self.forward_test(img, **kwargs)
+        elif mode == 'extract':
+            return self.encoder_q[0](img)
+        else:
+            raise Exception("No such mode: {}".format(mode))
+
+
+# utils
+@torch.no_grad()
+def concat_all_gather(tensor):
+    """
+    Performs all_gather operation on the provided tensors.
+    *** Warning ***: torch.distributed.all_gather has no gradient.
+    """
+    tensors_gather = [
+        torch.ones_like(tensor)
+        for _ in range(torch.distributed.get_world_size())
+    ]
+    torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
+
+    output = torch.cat(tensors_gather, dim=0)
+    return output
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`from .train import get_root_logger, set_random_seed, train_model`