Merge pull request #9 from open-mmlab/master

update
谢恩泽 (Enze Xie) 2020-11-25 20:25:15 +08:00 committed by GitHub
commit e5ddb3f9ca
15 changed files with 292 additions and 30 deletions

View File

@ -36,7 +36,7 @@ Below is the relations among Unsupervised Learning, Self-Supervised Learning and
<tr><td><a href="https://arxiv.org/abs/1911.05722" target="_blank" rel="noopener noreferrer">MoCo</a></td><td>79.18</td><td>60.60</td></tr>
<tr><td><a href="https://arxiv.org/abs/2003.04297" target="_blank" rel="noopener noreferrer">MoCo v2</a></td><td>84.26</td><td>67.69</td></tr>
<tr><td><a href="https://arxiv.org/abs/2002.05709" target="_blank" rel="noopener noreferrer">SimCLR</a></td><td>78.95</td><td>61.57</td></tr>
<tr><td><a href="https://arxiv.org/abs/2006.07733" target="_blank" rel="noopener noreferrer">BYOL (epoch=300)</a></td><td>86.58</td><td>72.35</td></tr>
</tbody></table>
- **Flexibility & Extensibility**
@ -68,6 +68,8 @@ Below is the relations among Unsupervised Learning, Self-Supervised Learning and
Please refer to [CHANGELOG.md](docs/CHANGELOG.md) for details and release history.
[2020-10-14] `OpenSelfSup` v0.3.0 is released with some bugs fixed and support of new features.
[2020-06-26] `OpenSelfSup` v0.2.0 is released with benchmark results and support of new features.
[2020-06-16] `OpenSelfSup` v0.1.0 is released.
@ -88,6 +90,20 @@ Please refer to [MODEL_ZOO.md](docs/MODEL_ZOO.md) for a comprehensive set of
This project is released under the [Apache 2.0 license](LICENSE).
## Citation
If you use this toolbox in your research, please consider citing:
```
@inproceedings{zhan2020online,
title={Online Deep Clustering for Unsupervised Representation Learning},
author={Zhan, Xiaohang and Xie, Jiahao and Liu, Ziwei and Ong, Yew-Soon and Loy, Chen Change},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={6688--6697},
year={2020}
}
```
## Acknowledgement
- This repo borrows the architecture design and part of the code from [MMDetection](https://github.com/open-mmlab/mmdetection).
@ -96,6 +112,10 @@ This project is released under the [Apache 2.0 license](LICENSE).
fair_self_supervision_benchmark](https://github.com/facebookresearch/fair_self_supervision_benchmark).
- `openselfsup/third_party/clustering.py` is borrowed from [deepcluster](https://github.com/facebookresearch/deepcluster/blob/master/clustering.py).
## Contributors
We encourage researchers interested in Self-Supervised Learning to contribute to OpenSelfSup. Your contributions, including implementing or transferring new methods to OpenSelfSup, performing experiments, reproducing results, parameter studies, etc., will be recorded in [MODEL_ZOO.md](docs/MODEL_ZOO.md). For now, the contributors include: Xiaohang Zhan ([@XiaohangZhan](http://github.com/XiaohangZhan)), Jiahao Xie ([@Jiahao000](https://github.com/Jiahao000)), Enze Xie ([@xieenze](https://github.com/xieenze)), and Zijian He ([@scnuhealthy](https://github.com/scnuhealthy)).
## Contact
This repo is currently maintained by Xiaohang Zhan ([@XiaohangZhan](http://github.com/XiaohangZhan)), Jiahao Xie ([@Jiahao000](https://github.com/Jiahao000)) and Enze Xie ([@xieenze](https://github.com/xieenze)).

View File

@ -0,0 +1,108 @@
import copy
_base_ = '../../base.py'
# model settings
model = dict(
type='BYOL',
pretrained=None,
base_momentum=0.99,
pre_conv=True,
backbone=dict(
type='ResNet',
depth=50,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='SyncBN')),
neck=dict(
type='NonLinearNeckSimCLR',
in_channels=2048,
hid_channels=4096,
out_channels=256,
num_layers=2,
sync_bn=True,
with_bias=True,
with_last_bn=False,
with_avg_pool=True),
head=dict(type='LatentPredictHead',
size_average=True,
predictor=dict(type='NonLinearNeckSimCLR',
in_channels=256, hid_channels=4096,
out_channels=256, num_layers=2, sync_bn=True,
with_bias=True, with_last_bn=False, with_avg_pool=False)))
# dataset settings
data_source_cfg = dict(
type='ImageNet',
memcached=True,
mclient_path='/mnt/lustre/share/memcached_client')
data_train_list = 'data/imagenet/meta/train.txt'
data_train_root = 'data/imagenet/train'
dataset_type = 'BYOLDataset'
img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_pipeline = [
dict(type='RandomResizedCrop', size=224, interpolation=3),
dict(type='RandomHorizontalFlip'),
dict(
type='RandomAppliedTrans',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
p=0.8),
dict(type='RandomGrayscale', p=0.2),
dict(
type='RandomAppliedTrans',
transforms=[
dict(
type='GaussianBlur',
sigma_min=0.1,
sigma_max=2.0)
],
p=1.),
dict(type='RandomAppliedTrans',
transforms=[dict(type='Solarization')], p=0.),
dict(type='ToTensor'),
dict(type='Normalize', **img_norm_cfg),
]
train_pipeline1 = copy.deepcopy(train_pipeline)
train_pipeline2 = copy.deepcopy(train_pipeline)
train_pipeline2[4]['p'] = 0.1 # gaussian blur
train_pipeline2[5]['p'] = 0.2 # solarization
data = dict(
imgs_per_gpu=32, # total 32*8(gpu)*16(interval)=4096
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_source=dict(
list_file=data_train_list, root=data_train_root,
**data_source_cfg),
pipeline1=train_pipeline1,
pipeline2=train_pipeline2))
# additional hooks
update_interval = 16 # interval for accumulate gradient
custom_hooks = [
dict(type='BYOLHook', end_momentum=1., update_interval=update_interval)
]
# optimizer
optimizer = dict(type='LARS', lr=4.8, weight_decay=0.000001, momentum=0.9,
paramwise_options={
'(bn|gn)(\d+)?.(weight|bias)': dict(weight_decay=0., lars_exclude=True),
'bias': dict(weight_decay=0., lars_exclude=True)})
# apex
use_fp16 = False
optimizer_config = dict(update_interval=update_interval, use_fp16=use_fp16)
# learning policy
lr_config = dict(
policy='CosineAnnealing',
min_lr=0.,
warmup='linear',
warmup_iters=10,
warmup_ratio=0.0001, # cannot be 0
warmup_by_epoch=True)
checkpoint_config = dict(interval=10)
# runtime settings
total_epochs = 300
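
The effective batch size of 4096 in this config comes from gradient accumulation rather than from a large per-GPU batch: with `imgs_per_gpu=32` on 8 GPUs and `update_interval=16`, gradients from 16 consecutive iterations are summed before each optimizer step, and the same interval is passed to `BYOLHook` and `optimizer_config` so the momentum update and optional fp16 loss scaling stay aligned with those steps. Below is a minimal sketch of the accumulation pattern in plain PyTorch, with a stand-in model and data; it only illustrates the idea and is not the toolbox's optimizer hook.

```python
import torch
import torch.nn as nn

# Stand-ins for the real model, data loader and LARS optimizer (illustrative only).
model = nn.Linear(128, 64)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
update_interval = 16           # accumulate gradients over 16 iterations

optimizer.zero_grad()
for it in range(64):           # pretend these are training iterations
    x = torch.randn(32, 128)   # one per-GPU batch (imgs_per_gpu=32)
    loss = model(x).pow(2).mean()
    (loss / update_interval).backward()   # scale so the summed gradient matches one big batch
    if (it + 1) % update_interval == 0:
        optimizer.step()        # one update per 16 iterations
        optimizer.zero_grad()   # effective batch = 32 imgs * 8 GPUs * 16 steps = 4096
```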

View File

@ -0,0 +1,88 @@
_base_ = '../../base.py'
# model settings
num_classes = 10000
model = dict(
type='DeepCluster',
pretrained=None,
with_sobel=False,
backbone=dict(
type='ResNet',
depth=50,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN')),
neck=dict(type='AvgPoolNeck'),
head=dict(
type='ClsHead',
with_avg_pool=False, # already has avgpool in the neck
in_channels=2048,
num_classes=num_classes))
# dataset settings
data_source_cfg = dict(
type='ImageNet',
memcached=True,
mclient_path='/mnt/lustre/share/memcached_client')
data_train_list = 'data/imagenet/meta/train.txt'
data_train_root = 'data/imagenet/jpeg/train'
dataset_type = 'DeepClusterDataset'
img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_pipeline = [
dict(type='RandomResizedCrop', size=224),
dict(type='RandomHorizontalFlip'),
dict(type='RandomRotation', degrees=2),
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=1.0,
hue=0.5),
dict(type='RandomGrayscale', p=0.2),
dict(type='ToTensor'),
dict(type='Normalize', **img_norm_cfg),
]
extract_pipeline = [
dict(type='Resize', size=256),
dict(type='CenterCrop', size=224),
dict(type='ToTensor'),
dict(type='Normalize', **img_norm_cfg),
]
data = dict(
imgs_per_gpu=64, # 32
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_source=dict(
list_file=data_train_list, root=data_train_root,
**data_source_cfg),
pipeline=train_pipeline))
# additional hooks
custom_hooks = [
dict(
type='DeepClusterHook',
extractor=dict(
imgs_per_gpu=128,
workers_per_gpu=8,
dataset=dict(
type=dataset_type,
data_source=dict(
list_file=data_train_list,
root=data_train_root,
**data_source_cfg),
pipeline=extract_pipeline)),
clustering=dict(type='Kmeans', k=num_classes, pca_dim=256),
unif_sampling=True,
reweight=False,
reweight_pow=0.5,
initial=True, # call initially
interval=1)
]
# optimizer
optimizer = dict(
type='SGD', lr=0.3, momentum=0.9, weight_decay=0.00001,
nesterov=False,
paramwise_options={'\Ahead.': dict(momentum=0.)})
# learning policy
lr_config = dict(policy='step', step=[400])
checkpoint_config = dict(interval=10)
# runtime settings
total_epochs = 200
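
The `DeepClusterHook` above is what turns this into the DeepCluster loop: at every `interval` it extracts features with `extract_pipeline`, clusters them into `num_classes` pseudo-labels via `Kmeans` (after a PCA to `pca_dim`), relabels the training set, and, with `unif_sampling=True`, resamples it uniformly over clusters before the classification head is trained on the new labels. A rough sketch of that assignment step is shown below, using scikit-learn in place of the faiss-based `Kmeans` in `openselfsup/third_party/clustering.py` and random features in place of real extracted ones.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

num_classes = 10   # 10000 in the config; kept small so the sketch runs quickly
features = np.random.randn(1000, 2048).astype(np.float32)  # stand-in for extracted features

# 1) dimensionality reduction (pca_dim=256 in the config; 32 here)
reduced = PCA(n_components=32, whiten=True).fit_transform(features)
# 2) L2-normalize before clustering, as is common for deep features
reduced /= np.linalg.norm(reduced, axis=1, keepdims=True) + 1e-10
# 3) cluster assignments become the pseudo-labels for the ClsHead
pseudo_labels = KMeans(n_clusters=num_classes, n_init=10).fit_predict(reduced)

# 4) uniform sampling over clusters: samples in rare clusters get larger weights
counts = np.bincount(pseudo_labels, minlength=num_classes)
sample_weight = 1.0 / np.maximum(counts[pseudo_labels], 1)
```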

View File

@ -1,5 +1,21 @@
## Changelog
### v0.3.0 (14/10/2020)
#### Highlight
* Support Mixed Precision Training
* Improved GaussianBlur, doubling the training speed
* More benchmarking results
#### Bug Fixes
* Fix bugs in MoCo v2; the results are now reproducible.
* Fix bugs in BYOL.
#### New Features
* Mixed Precision Training
* Improved GaussianBlur, doubling the training speed of MoCo v2, SimCLR and BYOL
* More benchmarking results, including Places, VOC and COCO
### v0.2.0 (26/6/2020)
#### Highlights

View File

@ -38,6 +38,12 @@ An example:
SRUN_ARGS="-w xx.xx.xx.xx" bash tools/srun_train.sh Dummy configs/selfsup/odc/r50_v1.py 8 --resume_from work_dirs/selfsup/odc/r50_v1/epoch_100.pth
```
### Train with multiple machines
If you launch with multiple machines simply connected with ethernet, you have to modify `tools/dist_train.sh` or create a new script; please refer to the PyTorch [launch utility](https://pytorch.org/docs/stable/distributed.html#launch-utility). Training is usually slow if you do not have high-speed networking like InfiniBand.
If you launch with slurm, the command is the same as for single-machine training described above. You only need to change ${GPUS}, e.g., to 16 for two 8-GPU machines.
### Launch multiple jobs on a single machine
If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,

View File

@ -14,7 +14,7 @@ We have tested the following versions of OS and softwares:
- OS: Ubuntu 16.04/18.04 and CentOS 7.2
- CUDA: 9.0/9.2/10.0/10.1
- NCCL: 2.1.15/2.2.13/2.3.7/2.4.2 (PyTorch-1.1 w/ NCCL-2.4.2 has a deadlock bug, see [here](https://github.com/open-mmlab/OpenSelfSup/issues/6))
- GCC(G++): 4.9/5.3/5.4/7.3
### Install openselfsup

View File

@ -1,24 +1,29 @@
# Model Zoo
**OpenSelfSup needs your contribution!
Since we don't have sufficient GPUs to run these large-scale experiments, your contributions, including parameter studies, reproducing results, implementing new methods, etc., are essential to make OpenSelfSup better. Your contribution will be recorded in the table below; top contributors will be included in the author list of OpenSelfSup!**
## Pre-trained model download links and speed test.
**Note**
* Unless otherwise indicated, the testing GPUs are NVIDIA Tesla V100.
* Experiments with the same batch size are directly comparable in speed.
* The table records the implementors, who implemented the methods (either from scratch or by refactoring from other repos), and the experimenters, who performed the experiments and reproduced the results. The experimenters are responsible for the evaluation results on all the benchmarks, and the implementors are responsible for the implementation as well as the results. If no experimenter is indicated, the implementor is the experimenter by default.
<table><thead><tr><th>Method (Implementor)</th><th>Config (Experimenter)</th><th>Remarks</th><th>Download link</th><th>Batch size</th><th>Epochs</th><th>Time per epoch</th></tr></thead><tbody>
<tr><td><a href="https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py" target="_blank" rel="noopener noreferrer">ImageNet</a></td><td>-</td><td>torchvision</td><td><a href="https://drive.google.com/file/d/11xA3TOcbD0qOrwpBfYonEDeseE1wMfBh/view?usp=sharing" target="_blank" rel="noopener noreferrer">imagenet_r50-21352794.pth</a></td><td>-</td><td>-</td><td>-</td></tr>
<tr><td>Random</td><td>-</td><td>kaiming</td><td><a href="https://drive.google.com/file/d/1UaFTjd6sbKkZEE-f58Zv30bnx7C1qJBb/view?usp=sharing" target="_blank" rel="noopener noreferrer">random_r50-5d0fa71b.pth</a></td><td>-</td><td>-</td><td>-</td></tr>
<tr><td><a href="https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Doersch_Unsupervised_Visual_Representation_ICCV_2015_paper.pdf" target="_blank" rel="noopener noreferrer">Relative-Loc</a> (<a href="https://github.com/Jiahao000">@Jiahao000</a>)</td><td>selfsup/relative_loc/r50.py</td><td>default</td><td><a href="https://drive.google.com/file/d/1ibk1BI3PFQxZqcxuDfHs3n7JnWKCgl8x/view?usp=sharing" target="_blank" rel="noopener noreferrer">relative_loc_r50-342c9097.pth</a></td><td>512</td><td>70</td><td>21min17s</td></tr>
<tr><td><a href="https://arxiv.org/abs/1803.07728" target="_blank" rel="noopener noreferrer">Rotation-Pred</a> (<a href="https://github.com/XiaohangZhan">@XiaohangZhan</a>)</td><td>selfsup/rotation_pred/r50.py</td><td>default</td><td><a href="https://drive.google.com/file/d/1t3oClmIvQ0p8RZ0V5yvQFltzjqBO823Y/view?usp=sharing" target="_blank" rel="noopener noreferrer">rotation_r50-cfab8ebb.pth</a></td><td>128</td><td>70</td><td>49min58s</td></tr>
<tr><td><a href="https://arxiv.org/abs/1807.05520" target="_blank" rel="noopener noreferrer">DeepCluster</a> (<a href="https://github.com/XiaohangZhan">@XiaohangZhan</a>)</td><td>selfsup/deepcluster/r50.py</td><td>default</td><td><a href="https://drive.google.com/file/d/1GxgP7pI18JtFxDIC0hnHOanvUYajoLlg/view?usp=sharing" target="_blank" rel="noopener noreferrer">deepcluster_r50-bb8681e2.pth</a></td><td>512</td><td>200</td><td>41min57s</td></tr>
<tr><td><a href="https://arxiv.org/abs/1805.01978" target="_blank" rel="noopener noreferrer">NPID</a> (<a href="https://github.com/XiaohangZhan">@XiaohangZhan</a>)</td><td>selfsup/npid/r50.py</td><td>default</td><td><a href="https://drive.google.com/file/d/1sm6I3Y5XnCWdbmeLSF4YupUtPe5nRQMI/view?usp=sharing" target="_blank" rel="noopener noreferrer">npid_r50-dec3df0c.pth</a></td><td>256</td><td>200</td><td>20min5s</td></tr>
<tr><td></td><td>selfsup/npid/r50_ensure_neg.py</td><td>ensure_neg=True</td><td><a href="https://drive.google.com/file/d/1FldDrb6kzF3CZ7737mwCXVI6HE2aCSaF/view?usp=sharing" target="_blank" rel="noopener noreferrer">npid_r50_ensure_neg-ce09b7ae.pth</a></td><td></td><td></td><td></td></tr>
<tr><td><a href="http://openaccess.thecvf.com/content_CVPR_2020/papers/Zhan_Online_Deep_Clustering_for_Unsupervised_Representation_Learning_CVPR_2020_paper.pdf" target="_blank" rel="noopener noreferrer">ODC</a> (<a href="https://github.com/XiaohangZhan">@XiaohangZhan</a>)</td><td>selfsup/odc/r50_v1.py (<a href="https://github.com/Jiahao000">@Jiahao000</a>)</td><td>default</td><td><a href="https://drive.google.com/file/d/1EdhJeZAyMsD_pEW7uMhLzos5xZLdariN/view?usp=sharing" target="_blank" rel="noopener noreferrer">odc_r50_v1-5af5dd0c.pth</a></td><td>512</td><td>440</td><td>28min22s</td></tr>
<tr><td><a href="https://arxiv.org/abs/1911.05722" target="_blank" rel="noopener noreferrer">MoCo</a> (<a href="https://github.com/XiaohangZhan">@XiaohangZhan</a>)</td><td>selfsup/moco/r50_v1.py</td><td>default</td><td><a href="https://drive.google.com/file/d/1ANXfnoT8yBQQBBqR_kQLQorK20l65KMy/view?usp=sharing" target="_blank" rel="noopener noreferrer">moco_r50_v1-4ad89b5c.pth</a></td><td>256</td><td>200</td><td>22min58s</td></tr>
<tr><td><a href="https://arxiv.org/abs/2003.04297" target="_blank" rel="noopener noreferrer">MoCo v2</a> (<a href="https://github.com/XiaohangZhan">@XiaohangZhan</a>)</td><td>selfsup/moco/r50_v2.py</td><td>default</td><td><a href="https://drive.google.com/file/d/1ImO8A3uWbrTx21D1IqBDMUQvpN6wmv0d/view?usp=sharing" target="_blank" rel="noopener noreferrer">moco_r50_v2-e3b0c442.pth</a></td><td>256</td><td>200</td><td>25min7s</td></tr>
<tr><td><a href="https://arxiv.org/abs/2002.05709" target="_blank" rel="noopener noreferrer">SimCLR</a> (<a href="https://github.com/XiaohangZhan">@XiaohangZhan</a>)</td><td>selfsup/simclr/r50_bs256_ep200.py</td><td>default</td><td><a href="https://drive.google.com/file/d/1aZ43nSdivdNxHbM9DKVoZYVhZ8TNnmPp/view?usp=sharing" target="_blank" rel="noopener noreferrer">simclr_r50_bs256_ep200-4577e9a6.pth</a></td><td>256</td><td>200</td><td>32min13s</td></tr>
<tr><td></td><td>selfsup/simclr/r50_bs256_ep200_mocov2_neck.py</td><td>-&gt; MoCo v2 neck</td><td><a href="https://drive.google.com/file/d/1AXpSKqgWfnj6jCgN65BXSTCKFfuIVELa/view?usp=sharing" target="_blank" rel="noopener noreferrer">simclr_r50_bs256_ep200_mocov2_neck-0d6e5ff2.pth</a></td><td></td><td></td><td></td></tr>
<tr><td><a href="https://arxiv.org/abs/2006.07733" target="_blank" rel="noopener noreferrer">BYOL</a> (<a href="https://github.com/XiaohangZhan">@XiaohangZhan</a>)</td><td>selfsup/byol/r50_bs4096_ep200.py (<a href="https://github.com/xieenze">@xieenze</a>)</td><td>default</td><td><a href="https://drive.google.com/file/d/1Whj3j5E3ShQj_VufjrJSzWiq1xcZZCXN/view?usp=sharing" target="_blank" rel="noopener noreferrer">byol_r50-e3b0c442.pth</a></td><td>4096 (256 for speed test)</td><td>200</td><td>30min57s</td></tr>
<tr><td></td><td>selfsup/byol/r50_bs256_accumulate16_ep300.py (<a href="https://github.com/scnuhealthy">@scnuhealthy</a>)</td><td>default</td><td><a href="https://drive.google.com/file/d/12Zu9r3fE8qKF4OW6WQXa5Ec6VuA2m3j7/view?usp=sharing" target="_blank" rel="noopener noreferrer">byol_r50_bs256_accmulate16_ep300-5df46722.pth</a></td><td>256</td><td>300</td><td></td></tr>
</tbody></table>
## Benchmarks
@ -40,8 +45,10 @@
<tr><td><a href="https://arxiv.org/abs/2002.05709" target="_blank" rel="noopener noreferrer">SimCLR</a></td><td>selfsup/simclr/r50_bs256_ep200.py</td><td>default</td><td>feat5</td><td>78.95</td><td>32.45</td><td>40.76</td><td>50.4</td><td>59.01</td><td>65.45</td><td>70.13</td><td>73.58</td><td>75.35</td></tr>
<tr><td></td><td>selfsup/simclr/r50_bs256_ep200_mocov2_neck.py</td><td>-&gt; MoCo v2 neck</td><td>feat5</td><td>77.65</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
<tr><td><a href="https://arxiv.org/abs/2006.07733" target="_blank" rel="noopener noreferrer">BYOL</a></td><td>selfsup/byol/r50_bs4096_ep200.py</td><td>default</td><td>feat5</td><td>85.10</td><td>44.48</td><td>52.09</td><td>62.88</td><td>70.87</td><td>76.18</td><td>79.45</td><td>81.88</td><td>83.08</td></tr>
<tr><td></td><td>selfsup/byol/r50_bs256_accumulate16_ep300.py</td><td>default</td><td>feat5</td><td>86.58</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>
</tbody></table>
### ImageNet Linear Classification
**Note**
@ -49,6 +56,7 @@
* For DeepCluster, use the corresponding one with `_sobel`.
* ImageNet (Multi) evaluates features in around 9k dimensions from different layers. Top-1 result of the last epoch is reported.
* ImageNet (Last) evaluates the last feature after global average pooling, e.g., 2048 dimensions for resnet50. The best top-1 result among all epochs is reported (see the linear-probe sketch below).
* Usually, we report the best result from ImageNet (Multi) and ImageNet (Last) to ensure fairness, since different methods achieve their best performance on different layers.
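
The linear classification numbers below come from training a linear classifier on top of frozen features. A minimal linear-probe sketch of the avgpool ("Last") variant is given here, with a torchvision ResNet-50 as a stand-in backbone and illustrative hyperparameters; the actual benchmark is run through the toolbox's own benchmark configs.

```python
import torch
import torch.nn as nn
import torchvision

# Frozen backbone: only the 2048-d globally average-pooled feature is used.
backbone = torchvision.models.resnet50(pretrained=False)
backbone.fc = nn.Identity()              # drop the supervised classifier head
for p in backbone.parameters():
    p.requires_grad = False
backbone.eval()

classifier = nn.Linear(2048, 1000)       # the only trainable part
optimizer = torch.optim.SGD(classifier.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss()

images = torch.randn(8, 3, 224, 224)     # stand-in for an ImageNet batch
labels = torch.randint(0, 1000, (8,))
with torch.no_grad():
    feats = backbone(images)             # (8, 2048) avgpool features
loss = criterion(classifier(feats), labels)
loss.backward()
optimizer.step()
```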
<table><thead><tr><th rowspan="2">Method</th><th rowspan="2">Config</th><th rowspan="2">Remarks</th><th colspan="5">ImageNet (Multi)</th><th>ImageNet (Last)</th></tr>
<tr><td>feat1</td><td>feat2</td><td>feat3</td><td>feat4</td><td>feat5</td><td>avgpool</td></tr></thead><tbody>
@ -64,6 +72,7 @@
<tr><td><a href="https://arxiv.org/abs/2002.05709" target="_blank" rel="noopener noreferrer">SimCLR</a></td><td>selfsup/simclr/r50_bs256_ep200.py</td><td>default</td><td>17.09</td><td>31.37</td><td>41.38</td><td>54.35</td><td>61.57</td><td>60.06</td></tr>
<tr><td></td><td>selfsup/simclr/r50_bs256_ep200_mocov2_neck.py</td><td>-&gt; MoCo v2 neck</td><td>16.97</td><td>31.88</td><td>41.73</td><td>54.33</td><td>59.94</td><td>58.00</td></tr>
<tr><td><a href="https://arxiv.org/abs/2006.07733" target="_blank" rel="noopener noreferrer">BYOL</a></td><td>selfsup/byol/r50_bs4096_ep200.py</td><td>default</td><td>16.70</td><td>34.22</td><td>46.61</td><td>60.78</td><td>69.14</td><td>67.10</td></tr>
<tr><td></td><td>selfsup/byol/r50_bs256_accumulate16_ep300.py</td><td>default</td><td>14.07</td><td>34.44</td><td>47.22</td><td>63.08</td><td>72.35</td><td></td></tr>
</tbody></table>
### Places205 Linear Classification
@ -125,7 +134,7 @@
<tr><td><a href="https://arxiv.org/abs/1911.05722" target="_blank" rel="noopener noreferrer">MoCo</a></td><td>selfsup/moco/r50_v1.py</td><td>default</td><td>r50_lr0_01_head100.py</td><td>60.08</td><td>84.02</td></tr>
<tr><td><a href="https://arxiv.org/abs/2003.04297" target="_blank" rel="noopener noreferrer">MoCo v2</a></td><td>selfsup/moco/r50_v2.py</td><td>default</td><td>r50_lr0_01_head100.py</td><td>61.80</td><td>85.11</td></tr>
<tr><td><a href="https://arxiv.org/abs/2002.05709" target="_blank" rel="noopener noreferrer">SimCLR</a></td><td>selfsup/simclr/r50_bs256_ep200.py</td><td>default</td><td>r50_lr0_01_head100.py</td><td>58.46</td><td>82.60</td></tr>
<tr><td></td><td>selfsup/simclr/r50_bs256_ep200_mocov2_neck.py</td><td>-&gt; MoCo v2 neck</td><td>r50_lr0_01_head100.py</td><td>58.38</td><td>82.53</td></tr>
<tr><td><a href="https://arxiv.org/abs/2006.07733" target="_blank" rel="noopener noreferrer">BYOL</a></td><td>selfsup/byol/r50_bs4096_ep200.py</td><td>default</td><td>r50_lr0_01_head100.py</td><td>65.94</td><td>87.81</td></tr>
</tbody></table>
@ -169,4 +178,3 @@
<tr><td><a href="https://arxiv.org/abs/2006.07733">BYOL</a></td><td>selfsup/byol/r50_bs4096_ep200.py</td><td>default</td><td>60.5</td><td>40.3</td><td>43.9</td><td>56.8</td><td>35.1</td><td>37.3</td></tr>
</tbody></table>

View File

@ -24,5 +24,5 @@ class ContrastiveDataset(BaseDataset):
img_cat = torch.cat((img1.unsqueeze(0), img2.unsqueeze(0)), dim=0)
return dict(img=img_cat)
def evaluate(self, scores, keyword, logger=None, **kwargs):
raise NotImplementedError

View File

@ -42,7 +42,7 @@ class Cifar100(object):
assert split in ['train', 'test']
try:
self.cifar = CIFAR100(
root=root, train=split == 'train', download=False)
except:
raise Exception("Please download CIFAR100 manually, \
in case of downloading the dataset in parallel \

View File

@ -12,7 +12,9 @@ from openselfsup.utils import build_from_cfg
from ..registry import PIPELINES
# register all existing transforms in torchvision
_EXCLUDED_TRANSFORMS = ['GaussianBlur']
for m in inspect.getmembers(_transforms, inspect.isclass):
if m[0] not in _EXCLUDED_TRANSFORMS:
PIPELINES.register_module(m[1])
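
Torchvision's transforms are registered wholesale except for `GaussianBlur`, presumably so that the name resolves to the toolbox's own implementation, which exposes the `sigma_min`/`sigma_max` interface used in the BYOL config above (and whose speedup is mentioned in the changelog). A hypothetical PIL-based transform with that interface might look as follows; it is only an illustration, not the registered implementation.

```python
import random
from PIL import ImageFilter


class RandomGaussianBlur(object):
    """Blur a PIL image with a sigma drawn uniformly from [sigma_min, sigma_max]."""

    def __init__(self, sigma_min=0.1, sigma_max=2.0):
        self.sigma_min = sigma_min
        self.sigma_max = sigma_max

    def __call__(self, img):
        sigma = random.uniform(self.sigma_min, self.sigma_max)
        return img.filter(ImageFilter.GaussianBlur(radius=sigma))
```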

View File

@ -1,5 +1,6 @@
from math import cos, pi
from mmcv.runner import Hook
from mmcv.parallel import is_module_wrapper
from .registry import HOOKS
@ -17,17 +18,26 @@ class BYOLHook(Hook):
for the target network. Default: 1.
"""
def __init__(self, end_momentum=1., update_interval=1, **kwargs):
self.end_momentum = end_momentum
self.update_interval = update_interval
def before_train_iter(self, runner):
assert hasattr(runner.model.module, 'momentum'), \
"The runner must have attribute \"momentum\" in BYOLHook."
assert hasattr(runner.model.module, 'base_momentum'), \
"The runner must have attribute \"base_momentum\" in BYOLHook."
if self.every_n_iters(runner, self.update_interval):
cur_iter = runner.iter
max_iter = runner.max_iters
base_m = runner.model.module.base_momentum
m = self.end_momentum - (self.end_momentum - base_m) * (
cos(pi * cur_iter / float(max_iter)) + 1) / 2
runner.model.module.momentum = m
def after_train_iter(self, runner):
if self.every_n_iters(runner, self.update_interval):
if is_module_wrapper(runner.model):
runner.model.module.momentum_update()
else:
runner.model.momentum_update()
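
The hook now recomputes the target-network momentum and calls `momentum_update()` only on iterations that coincide with an optimizer step (`every_n_iters(runner, update_interval)`), which keeps the exponential moving average aligned with gradient accumulation. The momentum itself follows a cosine ramp from `base_momentum` to `end_momentum`; the small script below evaluates the same formula at a few points (plain Python, with the config's `base_momentum=0.99`).

```python
from math import cos, pi

def byol_momentum(cur_iter, max_iter, base_m=0.99, end_m=1.0):
    # Cosine ramp: starts at base_m and reaches end_m at the last iteration.
    return end_m - (end_m - base_m) * (cos(pi * cur_iter / float(max_iter)) + 1) / 2

max_iter = 100
for t in (0, 25, 50, 75, 100):
    print(t, round(byol_momentum(t, max_iter), 5))
# prints: 0 0.99, 25 0.99146, 50 0.995, 75 0.99854, 100 1.0
```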

View File

@ -70,6 +70,10 @@ class BYOL(nn.Module):
param_tgt.data = param_tgt.data * self.momentum + \
param_ol.data * (1. - self.momentum)
@torch.no_grad()
def momentum_update(self):
self._momentum_update()
def forward_train(self, img, **kwargs):
"""Forward computation during training.
@ -93,7 +97,6 @@ class BYOL(nn.Module):
loss = self.head(proj_online_v1, proj_target_v2)['loss'] + \
self.head(proj_online_v2, proj_target_v1)['loss']
self._momentum_update()
return dict(loss=loss)
def forward_test(self, img, **kwargs):
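
`momentum_update()` is a thin `@torch.no_grad()` wrapper around `_momentum_update()`, and the call was moved out of `forward_train` so that `BYOLHook.after_train_iter` triggers it once per `update_interval` (i.e., once per effective optimizer step) rather than on every forward pass. The underlying operation is a plain exponential moving average over parameter pairs; a standalone sketch with toy networks:

```python
import torch
import torch.nn as nn

@torch.no_grad()
def ema_update(online: nn.Module, target: nn.Module, momentum: float):
    # target <- momentum * target + (1 - momentum) * online, parameter by parameter
    for p_ol, p_tgt in zip(online.parameters(), target.parameters()):
        p_tgt.data.mul_(momentum).add_(p_ol.data, alpha=1.0 - momentum)

# Toy usage with stand-in networks.
online_net = nn.Linear(4, 4)
target_net = nn.Linear(4, 4)
target_net.load_state_dict(online_net.state_dict())  # start from identical weights
ema_update(online_net, target_net, momentum=0.99)
```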

View File

@ -5,6 +5,7 @@ import time
import numpy as np
import faiss
import torch
from scipy.sparse import csr_matrix
__all__ = ['Kmeans', 'PIC']

View File

@ -1,5 +1,5 @@
# GENERATED VERSION FILE
# TIME: Wed Oct 21 05:59:41 2020
__version__ = '0.3.0+df3689d'
short_version = '0.3.0'

View File

@ -12,7 +12,7 @@ def readme():
MAJOR = 0
MINOR = 3
PATCH = 0
SUFFIX = ''
if PATCH != '':