From a81c9f9ec1926800f1995b60e36792c585733baf Mon Sep 17 00:00:00 2001
From: Xinlei Chen
Date: Thu, 5 Aug 2021 11:59:29 -0700
Subject: [PATCH] update docs etc.

---
 README.md    | 28 +++++++++++++++++++++++-----
 main_moco.py |  2 +-
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 79de56e..415a546 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ This is the *default* setting for most hyper-parameters. With a batch size of 40
 On the first node, run:
 ```
 python main_moco.py \
+  --moco-m-cos \
   --dist-url 'tcp://[your node 1 address]:[specified port]' \
   --multiprocessing-distributed --world-size 2 --rank 0 \
   [your imagenet-folder with train and val folders]
@@ -100,7 +101,19 @@ python main_lincls.py \
 ```
 
-### Reference Setups
+### End-to-End Classification
+
+To perform end-to-end fine-tuning for ImageNet classification, first convert the pre-trained checkpoints to [DEiT](https://github.com/facebookresearch/deit) format:
+```
+python convert_to_deit.py \
+  --input [your checkpoint path]/[your checkpoint file].pth.tar \
+  --output [target checkpoint file].pth
+```
+Then use `[target checkpoint file].pth` to initialize weights in DEiT.
+
+With 100-epoch fine-tuning, the reference top-1 classification accuracy is 82.8%. With 300-epoch fine-tuning, it is 83.2%.
+
+### Reference Setups and Models
 
 For longer pre-trainings with ResNet-50, we find the following hyper-parameters work well (expected performance in the last column, will update logs/pre-trained models soon):
 
@@ -111,27 +124,31 @@ For longer pre-trainings with ResNet-50, we find the following hyper-parameters
 learning rate
 weight decay
 momentum update
+momentum schedule
 top-1 acc.

 100
-0.45
+0.6
 1e-6
 0.99
-67.4
+cosine
+69.0

 300
 0.3
 1e-6
 0.99
-[TODO]72.8
+cosine
+73.0

 1000
 0.3
 1.5e-6
 0.996
+cosine
 [TODO]74.8
@@ -144,7 +161,8 @@ These hyper-parameters can be set with respective arguments. For example:
 On the first node, run:
 ```
 python main_moco.py \
-  --moco-m=0.996 --lr=.3 --wd=1.5e-6 --epochs=1000 \
+  --lr=.3 --wd=1.5e-6 --epochs=1000 \
+  --moco-m=0.996 --moco-m-cos \
   --dist-url "tcp://[your node 1 address]:[specified port]" \
   --multiprocessing-distributed --world-size 2 --rank 0 \
   [your imagenet-folder with train and val folders]
diff --git a/main_moco.py b/main_moco.py
index 0b3d710..bb92b90 100755
--- a/main_moco.py
+++ b/main_moco.py
@@ -76,7 +76,7 @@ parser.add_argument('-b', '--batch-size', default=4096, type=int,
                     help='mini-batch size (default: 4096), this is the total '
                          'batch size of all GPUs on the current node when '
                          'using Data Parallel or Distributed Data Parallel')
-parser.add_argument('--lr', '--learning-rate', default=0.45, type=float,
+parser.add_argument('--lr', '--learning-rate', default=0.6, type=float,
                     metavar='LR', help='initial (base) learning rate', dest='lr')
 parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                     help='momentum')
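
Note on `--moco-m-cos`: this flag switches the momentum-encoder coefficient from a constant to a cosine schedule, which is why the reference setups above list "cosine" under momentum schedule. The snippet below is a minimal sketch of such a schedule, assuming it ramps the coefficient from the base value (`--moco-m`) toward 1.0 over training; it is an illustration, not code copied from `main_moco.py`.

```python
import math

def cosine_momentum(epoch, total_epochs, base_m=0.996):
    """Cosine-ramp the momentum coefficient from base_m toward 1.0.

    Epoch 0 gives base_m; the final epoch approaches 1.0, so the momentum
    encoder is updated ever more slowly as training progresses.
    """
    return 1.0 - 0.5 * (1.0 + math.cos(math.pi * epoch / total_epochs)) * (1.0 - base_m)

# With the 1000-epoch setup above (--moco-m=0.996):
for epoch in (0, 500, 1000):
    print(epoch, round(cosine_momentum(epoch, 1000), 4))  # 0.996, 0.998, 1.0
```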
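For the end-to-end classification step, `convert_to_deit.py` performs the checkpoint conversion. Purely to illustrate the idea, a hypothetical conversion would keep only the base-encoder weights from the MoCo checkpoint and re-key them so a DEiT-style loader can consume them; the `module.base_encoder.` prefix and the `{"model": ...}` output layout below are assumptions, not read from the script.

```python
import torch

def convert_checkpoint(input_path, output_path, prefix="module.base_encoder."):
    """Hypothetical sketch: extract base-encoder weights from a MoCo
    pre-training checkpoint and save them for DEiT-style initialization."""
    ckpt = torch.load(input_path, map_location="cpu")
    state_dict = ckpt["state_dict"]
    encoder = {
        k[len(prefix):]: v
        for k, v in state_dict.items()
        if k.startswith(prefix) and not k.startswith(prefix + "head")
    }
    torch.save({"model": encoder}, output_path)  # assumed output layout
```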
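On the `--lr` default change (0.45 to 0.6): the value is a base learning rate for the default batch size of 4096. A common convention is to scale the base rate linearly with batch size; whether this repository applies exactly this rule is not shown in the patch, so treat the following as an assumption-labeled sketch.

```python
def effective_lr(base_lr, batch_size, reference_batch=256):
    """Linear scaling rule: the learning rate grows with total batch size."""
    return base_lr * batch_size / reference_batch

# If the new default base lr (0.6) were scaled this way at batch size 4096:
print(effective_lr(0.6, 4096))  # 0.6 * 4096 / 256 = 9.6
```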