From 7074f47f0d1f59351c2f70a1ca697927c7bf9fb7 Mon Sep 17 00:00:00 2001
From: Scott Lowe
Date: Wed, 24 Nov 2021 21:45:39 -0400
Subject: [PATCH 1/2] API: Change --batch-size description to match behaviour

The batch size argument is the total across all GPUs on all nodes, not
all GPUs on one node.
---
 main_moco.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main_moco.py b/main_moco.py
index f510ff4..97707c1 100755
--- a/main_moco.py
+++ b/main_moco.py
@@ -60,7 +60,7 @@ parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
 parser.add_argument('-b', '--batch-size', default=4096, type=int,
                     metavar='N',
                     help='mini-batch size (default: 4096), this is the total '
-                         'batch size of all GPUs on the current node when '
+                         'batch size of all GPUs on all nodes when '
                          'using Data Parallel or Distributed Data Parallel')
 parser.add_argument('--lr', '--learning-rate', default=0.6, type=float,
                     metavar='LR', help='initial (base) learning rate', dest='lr')

From a29021ab3eaaf8b7a8d8b45d443f2008a7baa03d Mon Sep 17 00:00:00 2001
From: Scott Lowe
Date: Wed, 24 Nov 2021 21:47:12 -0400
Subject: [PATCH 2/2] API: Change --batch-size argument to be across all nodes, not one

Changed the behaviour of main_lincls.py to be consistent with the
behaviour of main_moco.py.

https://github.com/facebookresearch/moco-v3/blob/878544a/main_moco.py#L206
---
 main_lincls.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main_lincls.py b/main_lincls.py
index 2fcfb4b..333bd4a 100755
--- a/main_lincls.py
+++ b/main_lincls.py
@@ -53,7 +53,7 @@ parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
 parser.add_argument('-b', '--batch-size', default=1024, type=int,
                     metavar='N',
                     help='mini-batch size (default: 1024), this is the total '
-                         'batch size of all GPUs on the current node when '
+                         'batch size of all GPUs on all nodes when '
                          'using Data Parallel or Distributed Data Parallel')
 parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                     metavar='LR', help='initial (base) learning rate', dest='lr')
@@ -207,7 +207,7 @@ def main_worker(gpu, ngpus_per_node, args):
             # When using a single GPU per process and per
             # DistributedDataParallel, we need to divide the batch size
             # ourselves based on the total number of GPUs we have
-            args.batch_size = int(args.batch_size / ngpus_per_node)
+            args.batch_size = int(args.batch_size / args.world_size)
             args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
             model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
         else:
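
Illustration (not part of either patch): after these changes, --batch-size
means the global batch size across every process, and main_lincls.py
divides by args.world_size instead of ngpus_per_node, matching main_moco.py.
A minimal Python sketch of the arithmetic, assuming a hypothetical run on
2 nodes with 8 GPUs each and the default --batch-size of 1024:

    # Hypothetical launch configuration (values not taken from the patches).
    ngpus_per_node = 8
    nnodes = 2
    world_size = ngpus_per_node * nnodes             # 16 processes in total

    batch_size = 1024                                # --batch-size (global)

    # Old behaviour: divide by the per-node GPU count.
    per_gpu_old = int(batch_size / ngpus_per_node)   # 128 samples per GPU
    effective_old = per_gpu_old * world_size         # 2048 global: doubled

    # New behaviour: divide by the world size.
    per_gpu_new = int(batch_size / world_size)       # 64 samples per GPU
    effective_new = per_gpu_new * world_size         # 1024 global, as requested

On a multi-node run the old division scaled the effective global batch size
by the number of nodes; dividing by args.world_size makes the flag behave as
its help text now describes.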