diff --git a/configs/conformer/conformer-base-p16_8xb128_in1k.py b/configs/conformer/conformer-base-p16_8xb128_in1k.py index a44f56f3..00cac086 100644 --- a/configs/conformer/conformer-base-p16_8xb128_in1k.py +++ b/configs/conformer/conformer-base-p16_8xb128_in1k.py @@ -6,3 +6,8 @@ _base_ = [ ] train_dataloader = dict(batch_size=128) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/conformer/conformer-small-p16_8xb128_in1k.py b/configs/conformer/conformer-small-p16_8xb128_in1k.py index a937f4f9..d5d55d79 100644 --- a/configs/conformer/conformer-small-p16_8xb128_in1k.py +++ b/configs/conformer/conformer-small-p16_8xb128_in1k.py @@ -6,3 +6,8 @@ _base_ = [ ] train_dataloader = dict(batch_size=128) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/conformer/conformer-small-p32_8xb128_in1k.py b/configs/conformer/conformer-small-p32_8xb128_in1k.py index 0b07ce2c..dcd4bbd6 100644 --- a/configs/conformer/conformer-small-p32_8xb128_in1k.py +++ b/configs/conformer/conformer-small-p32_8xb128_in1k.py @@ -6,3 +6,8 @@ _base_ = [ ] train_dataloader = dict(batch_size=128) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/conformer/conformer-tiny-p16_8xb128_in1k.py b/configs/conformer/conformer-tiny-p16_8xb128_in1k.py index f88c6c3b..127a54dd 100644 --- a/configs/conformer/conformer-tiny-p16_8xb128_in1k.py +++ b/configs/conformer/conformer-tiny-p16_8xb128_in1k.py @@ -6,3 +6,8 @@ _base_ = [ ] train_dataloader = dict(batch_size=128) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/convmixer/convmixer-1024-20_10xb64_in1k.py b/configs/convmixer/convmixer-1024-20_10xb64_in1k.py index dc5be7c5..5408e1f6 100644 --- a/configs/convmixer/convmixer-1024-20_10xb64_in1k.py +++ b/configs/convmixer/convmixer-1024-20_10xb64_in1k.py @@ -32,3 +32,8 @@ param_scheduler = [ ] train_cfg = dict(by_epoch=True, max_epochs=150) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (10 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=640) diff --git a/configs/convmixer/convmixer-1536-20_10xb64_in1k.py b/configs/convmixer/convmixer-1536-20_10xb64_in1k.py index 1acb7be1..d011825c 100644 --- a/configs/convmixer/convmixer-1536-20_10xb64_in1k.py +++ b/configs/convmixer/convmixer-1536-20_10xb64_in1k.py @@ -32,3 +32,8 @@ param_scheduler = [ ] train_cfg = dict(by_epoch=True, max_epochs=150) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (10 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=640) diff --git a/configs/convmixer/convmixer-768-32_10xb64_in1k.py b/configs/convmixer/convmixer-768-32_10xb64_in1k.py index e3f14a61..8e092c00 100644 --- a/configs/convmixer/convmixer-768-32_10xb64_in1k.py +++ b/configs/convmixer/convmixer-768-32_10xb64_in1k.py @@ -12,3 +12,8 @@ optim_wrapper = dict( ) train_cfg = dict(by_epoch=True, max_epochs=300) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (10 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=640) diff --git a/configs/convnext/convnext-base_32xb128_in1k.py b/configs/convnext/convnext-base_32xb128_in1k.py index c8fa4ef7..e8fc11c6 100644 --- a/configs/convnext/convnext-base_32xb128_in1k.py +++ b/configs/convnext/convnext-base_32xb128_in1k.py @@ -16,3 +16,8 @@ optim_wrapper = dict( # runtime setting custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/convnext-large_64xb64_in1k.py b/configs/convnext/convnext-large_64xb64_in1k.py index 6edc3a58..4dda425a 100644 --- a/configs/convnext/convnext-large_64xb64_in1k.py +++ b/configs/convnext/convnext-large_64xb64_in1k.py @@ -16,3 +16,8 @@ optim_wrapper = dict( # runtime setting custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/convnext-small_32xb128_in1k.py b/configs/convnext/convnext-small_32xb128_in1k.py index 13304332..a7215bfa 100644 --- a/configs/convnext/convnext-small_32xb128_in1k.py +++ b/configs/convnext/convnext-small_32xb128_in1k.py @@ -16,3 +16,8 @@ optim_wrapper = dict( # runtime setting custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/convnext-tiny_32xb128_in1k.py b/configs/convnext/convnext-tiny_32xb128_in1k.py index 5c09a279..d46c88a6 100644 --- a/configs/convnext/convnext-tiny_32xb128_in1k.py +++ b/configs/convnext/convnext-tiny_32xb128_in1k.py @@ -16,3 +16,8 @@ optim_wrapper = dict( # runtime setting custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/convnext-xlarge_64xb64_in1k.py b/configs/convnext/convnext-xlarge_64xb64_in1k.py index e8f29739..da493332 100644 --- a/configs/convnext/convnext-xlarge_64xb64_in1k.py +++ b/configs/convnext/convnext-xlarge_64xb64_in1k.py @@ -16,3 +16,8 @@ optim_wrapper = dict( # runtime setting custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/cspnet/cspdarknet50_8xb32_in1k.py b/configs/cspnet/cspdarknet50_8xb32_in1k.py index 4edc2531..8688eea0 100644 --- a/configs/cspnet/cspdarknet50_8xb32_in1k.py +++ b/configs/cspnet/cspdarknet50_8xb32_in1k.py @@ -43,3 +43,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/cspnet/cspresnet50_8xb32_in1k.py b/configs/cspnet/cspresnet50_8xb32_in1k.py index b28c8fe6..8ba015c1 100644 --- a/configs/cspnet/cspresnet50_8xb32_in1k.py +++ b/configs/cspnet/cspresnet50_8xb32_in1k.py @@ -43,3 +43,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/cspnet/cspresnext50_8xb32_in1k.py b/configs/cspnet/cspresnext50_8xb32_in1k.py index 5885bd98..64092086 100644 --- a/configs/cspnet/cspresnext50_8xb32_in1k.py +++ b/configs/cspnet/cspresnext50_8xb32_in1k.py @@ -43,3 +43,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/deit/deit-base-distilled_ft-16xb32_in1k-384px.py b/configs/deit/deit-base-distilled_ft-16xb32_in1k-384px.py index c8bdfb53..13a6df74 100644 --- a/configs/deit/deit-base-distilled_ft-16xb32_in1k-384px.py +++ b/configs/deit/deit-base-distilled_ft-16xb32_in1k-384px.py @@ -7,3 +7,8 @@ model = dict( # Change to the path of the pretrained model # init_cfg=dict(type='Pretrained', checkpoint=''), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/deit/deit-base-distilled_pt-16xb64_in1k.py b/configs/deit/deit-base-distilled_pt-16xb64_in1k.py index 039e53d9..818b41ca 100644 --- a/configs/deit/deit-base-distilled_pt-16xb64_in1k.py +++ b/configs/deit/deit-base-distilled_pt-16xb64_in1k.py @@ -8,3 +8,8 @@ model = dict( # dataset settings train_dataloader = dict(batch_size=64) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/deit/deit-base_ft-16xb32_in1k-384px.py b/configs/deit/deit-base_ft-16xb32_in1k-384px.py index 9f53db2b..951a054a 100644 --- a/configs/deit/deit-base_ft-16xb32_in1k-384px.py +++ b/configs/deit/deit-base_ft-16xb32_in1k-384px.py @@ -30,3 +30,8 @@ train_dataloader = dict(batch_size=32) # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (16 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/deit/deit-base_pt-16xb64_in1k.py b/configs/deit/deit-base_pt-16xb64_in1k.py index a4691a32..c7e30604 100644 --- a/configs/deit/deit-base_pt-16xb64_in1k.py +++ b/configs/deit/deit-base_pt-16xb64_in1k.py @@ -12,3 +12,8 @@ train_dataloader = dict(batch_size=64) # runtime settings custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/deit/deit-small-distilled_pt-4xb256_in1k.py b/configs/deit/deit-small-distilled_pt-4xb256_in1k.py index 3b1fac22..9415d6d3 100644 --- a/configs/deit/deit-small-distilled_pt-4xb256_in1k.py +++ b/configs/deit/deit-small-distilled_pt-4xb256_in1k.py @@ -5,3 +5,8 @@ model = dict( backbone=dict(type='DistilledVisionTransformer', arch='deit-small'), head=dict(type='DeiTClsHead', in_channels=384), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/deit/deit-small_pt-4xb256_in1k.py b/configs/deit/deit-small_pt-4xb256_in1k.py index e28d12f3..195e5d4e 100644 --- a/configs/deit/deit-small_pt-4xb256_in1k.py +++ b/configs/deit/deit-small_pt-4xb256_in1k.py @@ -46,3 +46,8 @@ optim_wrapper = dict( }), clip_grad=dict(max_norm=5.0), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (4 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/deit/deit-tiny-distilled_pt-4xb256_in1k.py b/configs/deit/deit-tiny-distilled_pt-4xb256_in1k.py index 175f9804..b365deb7 100644 --- a/configs/deit/deit-tiny-distilled_pt-4xb256_in1k.py +++ b/configs/deit/deit-tiny-distilled_pt-4xb256_in1k.py @@ -5,3 +5,8 @@ model = dict( backbone=dict(type='DistilledVisionTransformer', arch='deit-tiny'), head=dict(type='DeiTClsHead', in_channels=192), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/deit/deit-tiny_pt-4xb256_in1k.py b/configs/deit/deit-tiny_pt-4xb256_in1k.py index 43df6e13..ffc5a01b 100644 --- a/configs/deit/deit-tiny_pt-4xb256_in1k.py +++ b/configs/deit/deit-tiny_pt-4xb256_in1k.py @@ -5,3 +5,8 @@ model = dict( backbone=dict(type='VisionTransformer', arch='deit-tiny'), head=dict(type='VisionTransformerClsHead', in_channels=192), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/densenet/densenet121_4xb256_in1k.py b/configs/densenet/densenet121_4xb256_in1k.py index dc03defb..a888869e 100644 --- a/configs/densenet/densenet121_4xb256_in1k.py +++ b/configs/densenet/densenet121_4xb256_in1k.py @@ -10,3 +10,8 @@ train_dataloader = dict(batch_size=256) # schedule settings train_cfg = dict(by_epoch=True, max_epochs=90) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (4 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/densenet/densenet161_4xb256_in1k.py b/configs/densenet/densenet161_4xb256_in1k.py index 96a14121..adbe6049 100644 --- a/configs/densenet/densenet161_4xb256_in1k.py +++ b/configs/densenet/densenet161_4xb256_in1k.py @@ -10,3 +10,8 @@ train_dataloader = dict(batch_size=256) # schedule settings train_cfg = dict(by_epoch=True, max_epochs=90) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/densenet/densenet169_4xb256_in1k.py b/configs/densenet/densenet169_4xb256_in1k.py index 74b7b868..d4fc4d07 100644 --- a/configs/densenet/densenet169_4xb256_in1k.py +++ b/configs/densenet/densenet169_4xb256_in1k.py @@ -10,3 +10,8 @@ train_dataloader = dict(batch_size=256) # schedule settings train_cfg = dict(by_epoch=True, max_epochs=90) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/densenet/densenet201_4xb256_in1k.py b/configs/densenet/densenet201_4xb256_in1k.py index 5ce4eed3..7acf02ce 100644 --- a/configs/densenet/densenet201_4xb256_in1k.py +++ b/configs/densenet/densenet201_4xb256_in1k.py @@ -10,3 +10,8 @@ train_dataloader = dict(batch_size=256) # schedule settings train_cfg = dict(by_epoch=True, max_epochs=90) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (4 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py index 26f35917..efd1133b 100644 --- a/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py @@ -29,3 +29,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b0_8xb32_in1k.py b/configs/efficientnet/efficientnet-b0_8xb32_in1k.py index b88de4ee..86a3dae3 100644 --- a/configs/efficientnet/efficientnet-b0_8xb32_in1k.py +++ b/configs/efficientnet/efficientnet-b0_8xb32_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py index db82ac95..34b6e99b 100644 --- a/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py @@ -29,3 +29,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b1_8xb32_in1k.py b/configs/efficientnet/efficientnet-b1_8xb32_in1k.py index 53651159..d026a315 100644 --- a/configs/efficientnet/efficientnet-b1_8xb32_in1k.py +++ b/configs/efficientnet/efficientnet-b1_8xb32_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py index 5f6485ad..5013ad11 100644 --- a/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py @@ -29,3 +29,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b2_8xb32_in1k.py b/configs/efficientnet/efficientnet-b2_8xb32_in1k.py index ab389819..4092a799 100644 --- a/configs/efficientnet/efficientnet-b2_8xb32_in1k.py +++ b/configs/efficientnet/efficientnet-b2_8xb32_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py index cee63852..27c258b9 100644 --- a/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py @@ -29,3 +29,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b3_8xb32_in1k.py b/configs/efficientnet/efficientnet-b3_8xb32_in1k.py index 55cad6ad..2fa86a03 100644 --- a/configs/efficientnet/efficientnet-b3_8xb32_in1k.py +++ b/configs/efficientnet/efficientnet-b3_8xb32_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py index 7d7d9b18..bf84ce89 100644 --- a/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py @@ -29,3 +29,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b4_8xb32_in1k.py b/configs/efficientnet/efficientnet-b4_8xb32_in1k.py index 475daa4a..523afe12 100644 --- a/configs/efficientnet/efficientnet-b4_8xb32_in1k.py +++ b/configs/efficientnet/efficientnet-b4_8xb32_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py index d2d90f10..424aa82b 100644 --- a/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py @@ -29,3 +29,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b5_8xb32_in1k.py b/configs/efficientnet/efficientnet-b5_8xb32_in1k.py index b548de37..95e2c9ac 100644 --- a/configs/efficientnet/efficientnet-b5_8xb32_in1k.py +++ b/configs/efficientnet/efficientnet-b5_8xb32_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py index cea89508..f5be6afe 100644 --- a/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py @@ -29,3 +29,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b6_8xb32_in1k.py b/configs/efficientnet/efficientnet-b6_8xb32_in1k.py index eb9f9da6..0724160a 100644 --- a/configs/efficientnet/efficientnet-b6_8xb32_in1k.py +++ b/configs/efficientnet/efficientnet-b6_8xb32_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py index 9ed4d7a5..c3f23c97 100644 --- a/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py @@ -29,3 +29,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b7_8xb32_in1k.py b/configs/efficientnet/efficientnet-b7_8xb32_in1k.py index 3f9c1fc2..31a220e3 100644 --- a/configs/efficientnet/efficientnet-b7_8xb32_in1k.py +++ b/configs/efficientnet/efficientnet-b7_8xb32_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py index 79e34e8e..de2c297a 100644 --- a/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py @@ -29,3 +29,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-b8_8xb32_in1k.py b/configs/efficientnet/efficientnet-b8_8xb32_in1k.py index 81934303..570e7220 100644 --- a/configs/efficientnet/efficientnet-b8_8xb32_in1k.py +++ b/configs/efficientnet/efficientnet-b8_8xb32_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-em_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-em_8xb32-01norm_in1k.py index a4b91426..301739a2 100644 --- a/configs/efficientnet/efficientnet-em_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-em_8xb32-01norm_in1k.py @@ -29,3 +29,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/efficientnet/efficientnet-es_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-es_8xb32-01norm_in1k.py index be79f225..26112061 100644 --- a/configs/efficientnet/efficientnet-es_8xb32-01norm_in1k.py +++ b/configs/efficientnet/efficientnet-es_8xb32-01norm_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/hrnet/hrnet-w18_4xb32_in1k.py b/configs/hrnet/hrnet-w18_4xb32_in1k.py index a84fe67f..80c96236 100644 --- a/configs/hrnet/hrnet-w18_4xb32_in1k.py +++ b/configs/hrnet/hrnet-w18_4xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256_coslr.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (4 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/hrnet/hrnet-w30_4xb32_in1k.py b/configs/hrnet/hrnet-w30_4xb32_in1k.py index d2a9c0dd..a115b9f6 100644 --- a/configs/hrnet/hrnet-w30_4xb32_in1k.py +++ b/configs/hrnet/hrnet-w30_4xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256_coslr.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/hrnet/hrnet-w32_4xb32_in1k.py b/configs/hrnet/hrnet-w32_4xb32_in1k.py index 91380a96..ae212efd 100644 --- a/configs/hrnet/hrnet-w32_4xb32_in1k.py +++ b/configs/hrnet/hrnet-w32_4xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256_coslr.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/hrnet/hrnet-w40_4xb32_in1k.py b/configs/hrnet/hrnet-w40_4xb32_in1k.py index 5d35cecd..3306d9fc 100644 --- a/configs/hrnet/hrnet-w40_4xb32_in1k.py +++ b/configs/hrnet/hrnet-w40_4xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256_coslr.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (4 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/hrnet/hrnet-w44_4xb32_in1k.py b/configs/hrnet/hrnet-w44_4xb32_in1k.py index ce6bb41a..bcbd8e72 100644 --- a/configs/hrnet/hrnet-w44_4xb32_in1k.py +++ b/configs/hrnet/hrnet-w44_4xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256_coslr.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/hrnet/hrnet-w48_4xb32_in1k.py b/configs/hrnet/hrnet-w48_4xb32_in1k.py index 6943892e..ca488d1b 100644 --- a/configs/hrnet/hrnet-w48_4xb32_in1k.py +++ b/configs/hrnet/hrnet-w48_4xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256_coslr.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/hrnet/hrnet-w64_4xb32_in1k.py b/configs/hrnet/hrnet-w64_4xb32_in1k.py index 0009bc67..8ce649db 100644 --- a/configs/hrnet/hrnet-w64_4xb32_in1k.py +++ b/configs/hrnet/hrnet-w64_4xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256_coslr.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (4 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/inception_v3/inception-v3_8xb32_in1k.py b/configs/inception_v3/inception-v3_8xb32_in1k.py index 061ea6e5..4a0a32da 100644 --- a/configs/inception_v3/inception-v3_8xb32_in1k.py +++ b/configs/inception_v3/inception-v3_8xb32_in1k.py @@ -22,3 +22,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/lenet/lenet5_mnist.py b/configs/lenet/lenet5_mnist.py index 78f2ada8..c54bb435 100644 --- a/configs/lenet/lenet5_mnist.py +++ b/configs/lenet/lenet5_mnist.py @@ -84,3 +84,8 @@ load_from = None # whether to resume the training of the checkpoint resume_from = None + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/mlp_mixer/mlp-mixer-base-p16_64xb64_in1k.py b/configs/mlp_mixer/mlp-mixer-base-p16_64xb64_in1k.py index bbf4268d..0c6e10a9 100644 --- a/configs/mlp_mixer/mlp-mixer-base-p16_64xb64_in1k.py +++ b/configs/mlp_mixer/mlp-mixer-base-p16_64xb64_in1k.py @@ -6,3 +6,8 @@ _base_ = [ ] optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/mlp_mixer/mlp-mixer-large-p16_64xb64_in1k.py b/configs/mlp_mixer/mlp-mixer-large-p16_64xb64_in1k.py index 4fbe9c5c..60124e19 100644 --- a/configs/mlp_mixer/mlp-mixer-large-p16_64xb64_in1k.py +++ b/configs/mlp_mixer/mlp-mixer-large-p16_64xb64_in1k.py @@ -6,3 +6,8 @@ _base_ = [ ] optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py b/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py index afd2d979..01997276 100644 --- a/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py +++ b/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256_epochstep.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/mobilenet_v3/mobilenet-v3-large_8xb32_in1k.py b/configs/mobilenet_v3/mobilenet-v3-large_8xb32_in1k.py index 23a329c2..00854327 100644 --- a/configs/mobilenet_v3/mobilenet-v3-large_8xb32_in1k.py +++ b/configs/mobilenet_v3/mobilenet-v3-large_8xb32_in1k.py @@ -21,3 +21,8 @@ param_scheduler = dict(type='StepLR', by_epoch=True, step_size=2, gamma=0.973) train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1) val_cfg = dict() test_cfg = dict() + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/mobilenet_v3/mobilenet-v3-small_8xb16_cifar10.py b/configs/mobilenet_v3/mobilenet-v3-small_8xb16_cifar10.py index 71c72224..6b491b9d 100644 --- a/configs/mobilenet_v3/mobilenet-v3-small_8xb16_cifar10.py +++ b/configs/mobilenet_v3/mobilenet-v3-small_8xb16_cifar10.py @@ -13,3 +13,8 @@ param_scheduler = dict( ) train_cfg = dict(by_epoch=True, max_epochs=200) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/mobilenet_v3/mobilenet-v3-small_8xb32_in1k.py b/configs/mobilenet_v3/mobilenet-v3-small_8xb32_in1k.py index b724a610..549fafea 100644 --- a/configs/mobilenet_v3/mobilenet-v3-small_8xb32_in1k.py +++ b/configs/mobilenet_v3/mobilenet-v3-small_8xb32_in1k.py @@ -21,3 +21,8 @@ param_scheduler = dict(type='StepLR', by_epoch=True, step_size=2, gamma=0.973) train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1) val_cfg = dict() test_cfg = dict() + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/poolformer/poolformer-m36_32xb128_in1k.py b/configs/poolformer/poolformer-m36_32xb128_in1k.py index 7b71d1f3..ae06c3a0 100644 --- a/configs/poolformer/poolformer-m36_32xb128_in1k.py +++ b/configs/poolformer/poolformer-m36_32xb128_in1k.py @@ -10,3 +10,8 @@ optim_wrapper = dict( optimizer=dict(lr=4e-3), clip_grad=dict(max_norm=5.0), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/poolformer/poolformer-m48_32xb128_in1k.py b/configs/poolformer/poolformer-m48_32xb128_in1k.py index 832102f6..8c3cb491 100644 --- a/configs/poolformer/poolformer-m48_32xb128_in1k.py +++ b/configs/poolformer/poolformer-m48_32xb128_in1k.py @@ -10,3 +10,8 @@ optim_wrapper = dict( optimizer=dict(lr=4e-3), clip_grad=dict(max_norm=5.0), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/poolformer/poolformer-s12_32xb128_in1k.py b/configs/poolformer/poolformer-s12_32xb128_in1k.py index b346ad8b..ea174e7d 100644 --- a/configs/poolformer/poolformer-s12_32xb128_in1k.py +++ b/configs/poolformer/poolformer-s12_32xb128_in1k.py @@ -10,3 +10,8 @@ optim_wrapper = dict( optimizer=dict(lr=4e-3), clip_grad=dict(max_norm=5.0), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/poolformer/poolformer-s24_32xb128_in1k.py b/configs/poolformer/poolformer-s24_32xb128_in1k.py index c0b17f30..fbd69bed 100644 --- a/configs/poolformer/poolformer-s24_32xb128_in1k.py +++ b/configs/poolformer/poolformer-s24_32xb128_in1k.py @@ -10,3 +10,8 @@ optim_wrapper = dict( optimizer=dict(lr=4e-3), clip_grad=dict(max_norm=5.0), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/poolformer/poolformer-s36_32xb128_in1k.py b/configs/poolformer/poolformer-s36_32xb128_in1k.py index f3487d13..2d5e30d2 100644 --- a/configs/poolformer/poolformer-s36_32xb128_in1k.py +++ b/configs/poolformer/poolformer-s36_32xb128_in1k.py @@ -10,3 +10,8 @@ optim_wrapper = dict( optimizer=dict(lr=4e-3), clip_grad=dict(max_norm=5.0), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/regnet/regnetx-1.6gf_8xb128_in1k.py b/configs/regnet/regnetx-1.6gf_8xb128_in1k.py index d3e9e934..3c11d071 100644 --- a/configs/regnet/regnetx-1.6gf_8xb128_in1k.py +++ b/configs/regnet/regnetx-1.6gf_8xb128_in1k.py @@ -4,3 +4,8 @@ _base_ = ['./regnetx-400mf_8xb128_in1k.py'] model = dict( backbone=dict(type='RegNet', arch='regnetx_1.6gf'), head=dict(in_channels=912, )) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/regnet/regnetx-12gf_8xb64_in1k.py b/configs/regnet/regnetx-12gf_8xb64_in1k.py index 2c84750f..480d8f1e 100644 --- a/configs/regnet/regnetx-12gf_8xb64_in1k.py +++ b/configs/regnet/regnetx-12gf_8xb64_in1k.py @@ -11,3 +11,8 @@ train_dataloader = dict(batch_size=64) # schedule settings # for batch_size 512, use lr = 0.4 optim_wrapper = dict(optimizer=dict(lr=0.4)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/regnet/regnetx-3.2gf_8xb64_in1k.py b/configs/regnet/regnetx-3.2gf_8xb64_in1k.py index 89101e14..713621b5 100644 --- a/configs/regnet/regnetx-3.2gf_8xb64_in1k.py +++ b/configs/regnet/regnetx-3.2gf_8xb64_in1k.py @@ -11,3 +11,8 @@ train_dataloader = dict(batch_size=64) # schedule settings # for batch_size 512, use lr = 0.4 optim_wrapper = dict(optimizer=dict(lr=0.4)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/regnet/regnetx-4.0gf_8xb64_in1k.py b/configs/regnet/regnetx-4.0gf_8xb64_in1k.py index cece654b..2ad6504d 100644 --- a/configs/regnet/regnetx-4.0gf_8xb64_in1k.py +++ b/configs/regnet/regnetx-4.0gf_8xb64_in1k.py @@ -11,3 +11,8 @@ train_dataloader = dict(batch_size=64) # schedule settings # for batch_size 512, use lr = 0.4 optim_wrapper = dict(optimizer=dict(lr=0.4)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/regnet/regnetx-400mf_8xb128_in1k.py b/configs/regnet/regnetx-400mf_8xb128_in1k.py index 9f272381..ddbc660d 100644 --- a/configs/regnet/regnetx-400mf_8xb128_in1k.py +++ b/configs/regnet/regnetx-400mf_8xb128_in1k.py @@ -53,3 +53,8 @@ custom_hooks = [ interval=1, priority='ABOVE_NORMAL') ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/regnet/regnetx-6.4gf_8xb64_in1k.py b/configs/regnet/regnetx-6.4gf_8xb64_in1k.py index 488fc40e..80f7914c 100644 --- a/configs/regnet/regnetx-6.4gf_8xb64_in1k.py +++ b/configs/regnet/regnetx-6.4gf_8xb64_in1k.py @@ -11,3 +11,8 @@ train_dataloader = dict(batch_size=64) # schedule settings # for batch_size 512, use lr = 0.4 optim_wrapper = dict(optimizer=dict(lr=0.4)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/regnet/regnetx-8.0gf_8xb64_in1k.py b/configs/regnet/regnetx-8.0gf_8xb64_in1k.py index 98ca8954..c7e4117b 100644 --- a/configs/regnet/regnetx-8.0gf_8xb64_in1k.py +++ b/configs/regnet/regnetx-8.0gf_8xb64_in1k.py @@ -11,3 +11,8 @@ train_dataloader = dict(batch_size=64) # schedule settings # for batch_size 512, use lr = 0.4 optim_wrapper = dict(optimizer=dict(lr=0.4)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/regnet/regnetx-800mf_8xb128_in1k.py b/configs/regnet/regnetx-800mf_8xb128_in1k.py index 9cd71379..6931b155 100644 --- a/configs/regnet/regnetx-800mf_8xb128_in1k.py +++ b/configs/regnet/regnetx-800mf_8xb128_in1k.py @@ -4,3 +4,8 @@ _base_ = ['./regnetx-400mf_8xb128_in1k.py'] model = dict( backbone=dict(type='RegNet', arch='regnetx_800mf'), head=dict(in_channels=672, )) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/repmlp/repmlp-base_8xb64_in1k-256px.py b/configs/repmlp/repmlp-base_8xb64_in1k-256px.py index 86f1edc3..abba9bb0 100644 --- a/configs/repmlp/repmlp-base_8xb64_in1k-256px.py +++ b/configs/repmlp/repmlp-base_8xb64_in1k-256px.py @@ -29,3 +29,8 @@ test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/repmlp/repmlp-base_8xb64_in1k.py b/configs/repmlp/repmlp-base_8xb64_in1k.py index ad6e791a..a55d19ea 100644 --- a/configs/repmlp/repmlp-base_8xb64_in1k.py +++ b/configs/repmlp/repmlp-base_8xb64_in1k.py @@ -19,3 +19,8 @@ test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/repmlp/repmlp-base_delopy_8xb64_in1k.py b/configs/repmlp/repmlp-base_delopy_8xb64_in1k.py index b5b2c882..01a1ed3e 100644 --- a/configs/repmlp/repmlp-base_delopy_8xb64_in1k.py +++ b/configs/repmlp/repmlp-base_delopy_8xb64_in1k.py @@ -1,3 +1,8 @@ _base_ = ['./repmlp-base_8xb64_in1k.py'] model = dict(backbone=dict(deploy=True)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/repmlp/repmlp-base_deploy_8xb64_in1k-256px.py b/configs/repmlp/repmlp-base_deploy_8xb64_in1k-256px.py index 27ff50a0..1ce9c238 100644 --- a/configs/repmlp/repmlp-base_deploy_8xb64_in1k-256px.py +++ b/configs/repmlp/repmlp-base_deploy_8xb64_in1k-256px.py @@ -1,3 +1,8 @@ _base_ = ['./repmlp-base_8xb64_in1k-256px.py'] model = dict(backbone=dict(deploy=True)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py index 8a93ed0a..ca4cb757 100644 --- a/configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py @@ -10,3 +10,8 @@ param_scheduler = dict( type='CosineAnnealingLR', T_max=120, by_epoch=True, begin=0, end=120) train_cfg = dict(by_epoch=True, max_epochs=120) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py index 649020f2..5eb1c8bf 100644 --- a/configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py @@ -1,3 +1,8 @@ _base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' model = dict(backbone=dict(arch='A1')) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-A2_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-A2_4xb64-coslr-120e_in1k.py index eedaf2d2..2f955dad 100644 --- a/configs/repvgg/repvgg-A2_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-A2_4xb64-coslr-120e_in1k.py @@ -1,3 +1,8 @@ _base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' model = dict(backbone=dict(arch='A2'), head=dict(in_channels=1408)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-B0_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-B0_4xb64-coslr-120e_in1k.py index b3ce7ea2..209f3ad3 100644 --- a/configs/repvgg/repvgg-B0_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-B0_4xb64-coslr-120e_in1k.py @@ -1,3 +1,8 @@ _base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' model = dict(backbone=dict(arch='B0'), head=dict(in_channels=1280)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-B1_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-B1_4xb64-coslr-120e_in1k.py index 30adea3d..3270d19b 100644 --- a/configs/repvgg/repvgg-B1_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-B1_4xb64-coslr-120e_in1k.py @@ -1,3 +1,8 @@ _base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' model = dict(backbone=dict(arch='B1'), head=dict(in_channels=2048)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-B1g2_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-B1g2_4xb64-coslr-120e_in1k.py index 2749db8d..10a6847d 100644 --- a/configs/repvgg/repvgg-B1g2_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-B1g2_4xb64-coslr-120e_in1k.py @@ -1,3 +1,8 @@ _base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' model = dict(backbone=dict(arch='B1g2'), head=dict(in_channels=2048)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-B1g4_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-B1g4_4xb64-coslr-120e_in1k.py index 26476909..cef5d8a0 100644 --- a/configs/repvgg/repvgg-B1g4_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-B1g4_4xb64-coslr-120e_in1k.py @@ -1,3 +1,8 @@ _base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' model = dict(backbone=dict(arch='B1g4'), head=dict(in_channels=2048)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-B2_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-B2_4xb64-coslr-120e_in1k.py index 4d215567..2800e320 100644 --- a/configs/repvgg/repvgg-B2_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-B2_4xb64-coslr-120e_in1k.py @@ -1,3 +1,8 @@ _base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' model = dict(backbone=dict(arch='B2'), head=dict(in_channels=2560)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py index 11331cf0..993d97a2 100644 --- a/configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ b/configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py @@ -1,3 +1,8 @@ _base_ = './repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py' model = dict(backbone=dict(arch='B2g4')) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py index 97334aff..8b7ed746 100644 --- a/configs/repvgg/repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ b/configs/repvgg/repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py @@ -37,3 +37,8 @@ test_pipeline = [ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py index 67e3688c..467cd9f4 100644 --- a/configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ b/configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py @@ -1,3 +1,8 @@ _base_ = './repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py' model = dict(backbone=dict(arch='B3g4')) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py index d235610f..fadc0803 100644 --- a/configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ b/configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py @@ -1,3 +1,8 @@ _base_ = './repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py' model = dict(backbone=dict(arch='D2se')) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (4 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/res2net/res2net101-w26-s4_8xb32_in1k.py b/configs/res2net/res2net101-w26-s4_8xb32_in1k.py index 7ebe9e94..a3708f21 100644 --- a/configs/res2net/res2net101-w26-s4_8xb32_in1k.py +++ b/configs/res2net/res2net101-w26-s4_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/res2net/res2net50-w14-s8_8xb32_in1k.py b/configs/res2net/res2net50-w14-s8_8xb32_in1k.py index 56cc02e3..4aa8241d 100644 --- a/configs/res2net/res2net50-w14-s8_8xb32_in1k.py +++ b/configs/res2net/res2net50-w14-s8_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/res2net/res2net50-w26-s8_8xb32_in1k.py b/configs/res2net/res2net50-w26-s8_8xb32_in1k.py index d7dcbeb9..18896bb6 100644 --- a/configs/res2net/res2net50-w26-s8_8xb32_in1k.py +++ b/configs/res2net/res2net50-w26-s8_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnest/resnest101_32xb64_in1k.py b/configs/resnest/resnest101_32xb64_in1k.py index a2a6ca95..7b813f76 100644 --- a/configs/resnest/resnest101_32xb64_in1k.py +++ b/configs/resnest/resnest101_32xb64_in1k.py @@ -71,3 +71,8 @@ param_scheduler = [ ] train_cfg = dict(by_epoch=True, max_epochs=270) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (32 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/resnest/resnest200_64xb32_in1k.py b/configs/resnest/resnest200_64xb32_in1k.py index 65fc3d9f..334710e5 100644 --- a/configs/resnest/resnest200_64xb32_in1k.py +++ b/configs/resnest/resnest200_64xb32_in1k.py @@ -67,3 +67,8 @@ param_scheduler = [ ] train_cfg = dict(by_epoch=True, max_epochs=270) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (64 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/resnest/resnest269_64xb32_in1k.py b/configs/resnest/resnest269_64xb32_in1k.py index 08e85315..d4ef4d1f 100644 --- a/configs/resnest/resnest269_64xb32_in1k.py +++ b/configs/resnest/resnest269_64xb32_in1k.py @@ -71,3 +71,8 @@ param_scheduler = [ ] train_cfg = dict(by_epoch=True, max_epochs=270) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (64 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/resnest/resnest50_32xb64_in1k.py b/configs/resnest/resnest50_32xb64_in1k.py index e8096517..82675003 100644 --- a/configs/resnest/resnest50_32xb64_in1k.py +++ b/configs/resnest/resnest50_32xb64_in1k.py @@ -71,3 +71,8 @@ param_scheduler = [ ] train_cfg = dict(by_epoch=True, max_epochs=270) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (32 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/resnet/resnet101_8xb16_cifar10.py b/configs/resnet/resnet101_8xb16_cifar10.py index 166a1740..5d57c0e2 100644 --- a/configs/resnet/resnet101_8xb16_cifar10.py +++ b/configs/resnet/resnet101_8xb16_cifar10.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/cifar10_bs16.py', '../_base_/schedules/cifar10_bs128.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/resnet/resnet101_8xb32_in1k.py b/configs/resnet/resnet101_8xb32_in1k.py index 388d2cd9..812134f6 100644 --- a/configs/resnet/resnet101_8xb32_in1k.py +++ b/configs/resnet/resnet101_8xb32_in1k.py @@ -2,3 +2,8 @@ _base_ = [ '../_base_/models/resnet101.py', '../_base_/datasets/imagenet_bs32.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet152_8xb16_cifar10.py b/configs/resnet/resnet152_8xb16_cifar10.py index 3f307b6a..12a0a5e5 100644 --- a/configs/resnet/resnet152_8xb16_cifar10.py +++ b/configs/resnet/resnet152_8xb16_cifar10.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/cifar10_bs16.py', '../_base_/schedules/cifar10_bs128.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/resnet/resnet152_8xb32_in1k.py b/configs/resnet/resnet152_8xb32_in1k.py index cc9dc2ce..46d39fc4 100644 --- a/configs/resnet/resnet152_8xb32_in1k.py +++ b/configs/resnet/resnet152_8xb32_in1k.py @@ -2,3 +2,8 @@ _base_ = [ '../_base_/models/resnet152.py', '../_base_/datasets/imagenet_bs32.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet18_8xb16_cifar10.py b/configs/resnet/resnet18_8xb16_cifar10.py index c7afa397..ae0264e1 100644 --- a/configs/resnet/resnet18_8xb16_cifar10.py +++ b/configs/resnet/resnet18_8xb16_cifar10.py @@ -2,3 +2,8 @@ _base_ = [ '../_base_/models/resnet18_cifar.py', '../_base_/datasets/cifar10_bs16.py', '../_base_/schedules/cifar10_bs128.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/resnet/resnet18_8xb32_in1k.py b/configs/resnet/resnet18_8xb32_in1k.py index ac452ff7..383638f1 100644 --- a/configs/resnet/resnet18_8xb32_in1k.py +++ b/configs/resnet/resnet18_8xb32_in1k.py @@ -2,3 +2,8 @@ _base_ = [ '../_base_/models/resnet18.py', '../_base_/datasets/imagenet_bs32.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet34_8xb16_cifar10.py b/configs/resnet/resnet34_8xb16_cifar10.py index 7f5cd517..b1f8b11c 100644 --- a/configs/resnet/resnet34_8xb16_cifar10.py +++ b/configs/resnet/resnet34_8xb16_cifar10.py @@ -2,3 +2,8 @@ _base_ = [ '../_base_/models/resnet34_cifar.py', '../_base_/datasets/cifar10_bs16.py', '../_base_/schedules/cifar10_bs128.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/resnet/resnet34_8xb32_in1k.py b/configs/resnet/resnet34_8xb32_in1k.py index 7749261c..0634c555 100644 --- a/configs/resnet/resnet34_8xb32_in1k.py +++ b/configs/resnet/resnet34_8xb32_in1k.py @@ -2,3 +2,8 @@ _base_ = [ '../_base_/models/resnet34.py', '../_base_/datasets/imagenet_bs32.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet50_32xb64-warmup-coslr_in1k.py b/configs/resnet/resnet50_32xb64-warmup-coslr_in1k.py index c26245ef..12c07fcc 100644 --- a/configs/resnet/resnet50_32xb64-warmup-coslr_in1k.py +++ b/configs/resnet/resnet50_32xb64-warmup-coslr_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs2048_coslr.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (32 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/resnet/resnet50_32xb64-warmup-lbs_in1k.py b/configs/resnet/resnet50_32xb64-warmup-lbs_in1k.py index 2f24f9a0..24168cc5 100644 --- a/configs/resnet/resnet50_32xb64-warmup-lbs_in1k.py +++ b/configs/resnet/resnet50_32xb64-warmup-lbs_in1k.py @@ -10,3 +10,8 @@ model = dict( label_smooth_val=0.1, num_classes=1000), )) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/resnet/resnet50_32xb64-warmup_in1k.py b/configs/resnet/resnet50_32xb64-warmup_in1k.py index 34d5288b..6af884d7 100644 --- a/configs/resnet/resnet50_32xb64-warmup_in1k.py +++ b/configs/resnet/resnet50_32xb64-warmup_in1k.py @@ -2,3 +2,8 @@ _base_ = [ '../_base_/models/resnet50.py', '../_base_/datasets/imagenet_bs64.py', '../_base_/schedules/imagenet_bs2048.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/resnet/resnet50_8xb128_coslr-90e_in21k.py b/configs/resnet/resnet50_8xb128_coslr-90e_in21k.py index d2cc1ee2..ff4c1511 100644 --- a/configs/resnet/resnet50_8xb128_coslr-90e_in21k.py +++ b/configs/resnet/resnet50_8xb128_coslr-90e_in21k.py @@ -9,3 +9,8 @@ model = dict(head=dict(num_classes=21843)) # runtime settings train_cfg = dict(by_epoch=True, max_epochs=90) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/resnet/resnet50_8xb16-mixup_cifar10.py b/configs/resnet/resnet50_8xb16-mixup_cifar10.py index 2420ebfe..e5b480fc 100644 --- a/configs/resnet/resnet50_8xb16-mixup_cifar10.py +++ b/configs/resnet/resnet50_8xb16-mixup_cifar10.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/cifar10_bs16.py', '../_base_/schedules/cifar10_bs128.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/resnet/resnet50_8xb16_cifar10.py b/configs/resnet/resnet50_8xb16_cifar10.py index 669e5de2..e1766c43 100644 --- a/configs/resnet/resnet50_8xb16_cifar10.py +++ b/configs/resnet/resnet50_8xb16_cifar10.py @@ -2,3 +2,8 @@ _base_ = [ '../_base_/models/resnet50_cifar.py', '../_base_/datasets/cifar10_bs16.py', '../_base_/schedules/cifar10_bs128.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/resnet/resnet50_8xb16_cifar100.py b/configs/resnet/resnet50_8xb16_cifar100.py index ebde6c76..7c740b3c 100644 --- a/configs/resnet/resnet50_8xb16_cifar100.py +++ b/configs/resnet/resnet50_8xb16_cifar100.py @@ -17,3 +17,8 @@ param_scheduler = dict( milestones=[60, 120, 160], gamma=0.2, ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py b/configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py index 3d2d5894..e0c90293 100644 --- a/configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py +++ b/configs/resnet/resnet50_8xb256-rsb-a1-600e_in1k.py @@ -53,3 +53,8 @@ param_scheduler = [ ] train_cfg = dict(by_epoch=True, max_epochs=600) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/resnet/resnet50_8xb256-rsb-a2-300e_in1k.py b/configs/resnet/resnet50_8xb256-rsb-a2-300e_in1k.py index a8e93003..f4330385 100644 --- a/configs/resnet/resnet50_8xb256-rsb-a2-300e_in1k.py +++ b/configs/resnet/resnet50_8xb256-rsb-a2-300e_in1k.py @@ -44,3 +44,8 @@ param_scheduler = [ end=300) ] train_cfg = dict(by_epoch=True, max_epochs=300) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/resnet/resnet50_8xb256-rsb-a3-100e_in1k.py b/configs/resnet/resnet50_8xb256-rsb-a3-100e_in1k.py index e6872a3b..00c44b8c 100644 --- a/configs/resnet/resnet50_8xb256-rsb-a3-100e_in1k.py +++ b/configs/resnet/resnet50_8xb256-rsb-a3-100e_in1k.py @@ -20,3 +20,8 @@ optim_wrapper = dict( optimizer=dict(lr=0.008), paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.), ) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (256 samples per GPU) +auto_scale_lr = dict(base_batch_size=2048) diff --git a/configs/resnet/resnet50_8xb32-coslr-preciseBN_in1k.py b/configs/resnet/resnet50_8xb32-coslr-preciseBN_in1k.py index dab82c6e..58c20def 100644 --- a/configs/resnet/resnet50_8xb32-coslr-preciseBN_in1k.py +++ b/configs/resnet/resnet50_8xb32-coslr-preciseBN_in1k.py @@ -10,3 +10,8 @@ custom_hooks = [ interval=1, priority='ABOVE_NORMAL') ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet50_8xb32-coslr_in1k.py b/configs/resnet/resnet50_8xb32-coslr_in1k.py index 938a114b..45bcdc68 100644 --- a/configs/resnet/resnet50_8xb32-coslr_in1k.py +++ b/configs/resnet/resnet50_8xb32-coslr_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256_coslr.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet50_8xb32-cutmix_in1k.py b/configs/resnet/resnet50_8xb32-cutmix_in1k.py index 2f8d0ca9..1db9ecb6 100644 --- a/configs/resnet/resnet50_8xb32-cutmix_in1k.py +++ b/configs/resnet/resnet50_8xb32-cutmix_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet50_8xb32-fp16-dynamic_in1k.py b/configs/resnet/resnet50_8xb32-fp16-dynamic_in1k.py index 58f6fe4c..ce90a63a 100644 --- a/configs/resnet/resnet50_8xb32-fp16-dynamic_in1k.py +++ b/configs/resnet/resnet50_8xb32-fp16-dynamic_in1k.py @@ -2,3 +2,8 @@ _base_ = ['./resnet50_8xb32_in1k.py'] # schedule settings optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic') + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet50_8xb32-fp16_in1k.py b/configs/resnet/resnet50_8xb32-fp16_in1k.py index 19ee6ee4..592e37ef 100644 --- a/configs/resnet/resnet50_8xb32-fp16_in1k.py +++ b/configs/resnet/resnet50_8xb32-fp16_in1k.py @@ -2,3 +2,8 @@ _base_ = ['./resnet50_8xb32_in1k.py'] # schedule settings optim_wrapper = dict(type='AmpOptimWrapper', loss_scale=512.) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet50_8xb32-lbs_in1k.py b/configs/resnet/resnet50_8xb32-lbs_in1k.py index 1c1aa5a2..fef90281 100644 --- a/configs/resnet/resnet50_8xb32-lbs_in1k.py +++ b/configs/resnet/resnet50_8xb32-lbs_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet50_8xb32-mixup_in1k.py b/configs/resnet/resnet50_8xb32-mixup_in1k.py index 2a153d0e..b2f3e141 100644 --- a/configs/resnet/resnet50_8xb32-mixup_in1k.py +++ b/configs/resnet/resnet50_8xb32-mixup_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet50_8xb32_in1k.py b/configs/resnet/resnet50_8xb32_in1k.py index c32f333b..c01f2655 100644 --- a/configs/resnet/resnet50_8xb32_in1k.py +++ b/configs/resnet/resnet50_8xb32_in1k.py @@ -2,3 +2,8 @@ _base_ = [ '../_base_/models/resnet50.py', '../_base_/datasets/imagenet_bs32.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnet50_8xb8_cub.py b/configs/resnet/resnet50_8xb8_cub.py index 17054ef5..02f1ff5b 100644 --- a/configs/resnet/resnet50_8xb8_cub.py +++ b/configs/resnet/resnet50_8xb8_cub.py @@ -18,3 +18,8 @@ model = dict( # runtime settings default_hooks = dict(logger=dict(type='LoggerHook', interval=20)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/resnet/resnetv1c101_8xb32_in1k.py b/configs/resnet/resnetv1c101_8xb32_in1k.py index 441aff59..d8cf28bc 100644 --- a/configs/resnet/resnetv1c101_8xb32_in1k.py +++ b/configs/resnet/resnetv1c101_8xb32_in1k.py @@ -5,3 +5,8 @@ _base_ = [ ] model = dict(backbone=dict(depth=101)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnetv1c152_8xb32_in1k.py b/configs/resnet/resnetv1c152_8xb32_in1k.py index b9f466f8..e16345b2 100644 --- a/configs/resnet/resnetv1c152_8xb32_in1k.py +++ b/configs/resnet/resnetv1c152_8xb32_in1k.py @@ -5,3 +5,8 @@ _base_ = [ ] model = dict(backbone=dict(depth=152)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnetv1c50_8xb32_in1k.py b/configs/resnet/resnetv1c50_8xb32_in1k.py index aa1c8b64..06e0c613 100644 --- a/configs/resnet/resnetv1c50_8xb32_in1k.py +++ b/configs/resnet/resnetv1c50_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnetv1d101_8xb32_in1k.py b/configs/resnet/resnetv1d101_8xb32_in1k.py index b16ca863..307a7700 100644 --- a/configs/resnet/resnetv1d101_8xb32_in1k.py +++ b/configs/resnet/resnetv1d101_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnetv1d152_8xb32_in1k.py b/configs/resnet/resnetv1d152_8xb32_in1k.py index 76926ddb..1de344e6 100644 --- a/configs/resnet/resnetv1d152_8xb32_in1k.py +++ b/configs/resnet/resnetv1d152_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnet/resnetv1d50_8xb32_in1k.py b/configs/resnet/resnetv1d50_8xb32_in1k.py index 208bde47..cb672555 100644 --- a/configs/resnet/resnetv1d50_8xb32_in1k.py +++ b/configs/resnet/resnetv1d50_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnext/resnext101-32x4d_8xb32_in1k.py b/configs/resnext/resnext101-32x4d_8xb32_in1k.py index 970aa60f..c8691cae 100644 --- a/configs/resnext/resnext101-32x4d_8xb32_in1k.py +++ b/configs/resnext/resnext101-32x4d_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnext/resnext101-32x8d_8xb32_in1k.py b/configs/resnext/resnext101-32x8d_8xb32_in1k.py index 315d05fd..45bf4f79 100644 --- a/configs/resnext/resnext101-32x8d_8xb32_in1k.py +++ b/configs/resnext/resnext101-32x8d_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnext/resnext152-32x4d_8xb32_in1k.py b/configs/resnext/resnext152-32x4d_8xb32_in1k.py index 9c137313..918a7dee 100644 --- a/configs/resnext/resnext152-32x4d_8xb32_in1k.py +++ b/configs/resnext/resnext152-32x4d_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/resnext/resnext50-32x4d_8xb32_in1k.py b/configs/resnext/resnext50-32x4d_8xb32_in1k.py index bd9c9fcf..298cd320 100644 --- a/configs/resnext/resnext50-32x4d_8xb32_in1k.py +++ b/configs/resnext/resnext50-32x4d_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/seresnet/seresnet101_8xb32_in1k.py b/configs/seresnet/seresnet101_8xb32_in1k.py index 8be39e7a..f2f99336 100644 --- a/configs/seresnet/seresnet101_8xb32_in1k.py +++ b/configs/seresnet/seresnet101_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/seresnet/seresnet50_8xb32_in1k.py b/configs/seresnet/seresnet50_8xb32_in1k.py index 19082bd0..7159f617 100644 --- a/configs/seresnet/seresnet50_8xb32_in1k.py +++ b/configs/seresnet/seresnet50_8xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256_140e.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/seresnet/seresnext101-32x4d_8xb32_in1k.py b/configs/seresnet/seresnext101-32x4d_8xb32_in1k.py index 01778305..b89f464a 100644 --- a/configs/seresnet/seresnext101-32x4d_8xb32_in1k.py +++ b/configs/seresnet/seresnext101-32x4d_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/seresnet/seresnext50-32x4d_8xb32_in1k.py b/configs/seresnet/seresnext50-32x4d_8xb32_in1k.py index 4d593e45..73ebbdf3 100644 --- a/configs/seresnet/seresnext50-32x4d_8xb32_in1k.py +++ b/configs/seresnet/seresnext50-32x4d_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/shufflenet_v1/shufflenet-v1-1x_16xb64_in1k.py b/configs/shufflenet_v1/shufflenet-v1-1x_16xb64_in1k.py index 58e45f1b..a6ada1e0 100644 --- a/configs/shufflenet_v1/shufflenet-v1-1x_16xb64_in1k.py +++ b/configs/shufflenet_v1/shufflenet-v1-1x_16xb64_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs1024_linearlr_bn_nowd.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py b/configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py index a106ab86..99d81e38 100644 --- a/configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py +++ b/configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs1024_linearlr_bn_nowd.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/swin_transformer/swin-base_16xb64_in1k-384px.py b/configs/swin_transformer/swin-base_16xb64_in1k-384px.py index 10f89921..1df65856 100644 --- a/configs/swin_transformer/swin-base_16xb64_in1k-384px.py +++ b/configs/swin_transformer/swin-base_16xb64_in1k-384px.py @@ -7,3 +7,8 @@ _base_ = [ # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/swin_transformer/swin-base_16xb64_in1k.py b/configs/swin_transformer/swin-base_16xb64_in1k.py index 05a95b44..a0a566f5 100644 --- a/configs/swin_transformer/swin-base_16xb64_in1k.py +++ b/configs/swin_transformer/swin-base_16xb64_in1k.py @@ -7,3 +7,8 @@ _base_ = [ # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/swin_transformer/swin-large_16xb64_in1k-384px.py b/configs/swin_transformer/swin-large_16xb64_in1k-384px.py index 5ba52b35..fd9c22ac 100644 --- a/configs/swin_transformer/swin-large_16xb64_in1k-384px.py +++ b/configs/swin_transformer/swin-large_16xb64_in1k-384px.py @@ -7,3 +7,8 @@ _base_ = [ # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/swin_transformer/swin-large_16xb64_in1k.py b/configs/swin_transformer/swin-large_16xb64_in1k.py index 36121efc..db25a000 100644 --- a/configs/swin_transformer/swin-large_16xb64_in1k.py +++ b/configs/swin_transformer/swin-large_16xb64_in1k.py @@ -7,3 +7,8 @@ _base_ = [ # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/swin_transformer/swin-large_8xb8_cub_384px.py b/configs/swin_transformer/swin-large_8xb8_cub_384px.py index 7af5b53b..0f957315 100644 --- a/configs/swin_transformer/swin-large_8xb8_cub_384px.py +++ b/configs/swin_transformer/swin-large_8xb8_cub_384px.py @@ -37,3 +37,8 @@ default_hooks = dict( logger=dict(type='LoggerHook', interval=20), # save last three checkpoints checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/swin_transformer/swin-small_16xb64_in1k.py b/configs/swin_transformer/swin-small_16xb64_in1k.py index 7c1a8e21..9bbb3fef 100644 --- a/configs/swin_transformer/swin-small_16xb64_in1k.py +++ b/configs/swin_transformer/swin-small_16xb64_in1k.py @@ -7,3 +7,8 @@ _base_ = [ # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/swin_transformer/swin-tiny_16xb64_in1k.py b/configs/swin_transformer/swin-tiny_16xb64_in1k.py index 9a1ce250..bb9646c9 100644 --- a/configs/swin_transformer/swin-tiny_16xb64_in1k.py +++ b/configs/swin_transformer/swin-tiny_16xb64_in1k.py @@ -7,3 +7,8 @@ _base_ = [ # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py b/configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py index 193b7775..6d6c5b5b 100644 --- a/configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py +++ b/configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py @@ -42,3 +42,8 @@ test_cfg = dict() # runtime settings custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py b/configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py index 8fce1f3a..dd8b4cf7 100644 --- a/configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py +++ b/configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py @@ -42,3 +42,8 @@ test_cfg = dict() # runtime settings custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py b/configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py index c024b4a1..630c752b 100644 --- a/configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py +++ b/configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py @@ -42,3 +42,8 @@ test_cfg = dict() # runtime settings custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=512) diff --git a/configs/tnt/tnt-s-p16_16xb64_in1k.py b/configs/tnt/tnt-s-p16_16xb64_in1k.py index 50412868..0a7518a6 100644 --- a/configs/tnt/tnt-s-p16_16xb64_in1k.py +++ b/configs/tnt/tnt-s-p16_16xb64_in1k.py @@ -49,3 +49,8 @@ param_scheduler = [ train_cfg = dict(by_epoch=True, max_epochs=300, val_interval=1) val_cfg = dict() test_cfg = dict() + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/twins/twins-pcpvt-base_8xb128_in1k.py b/configs/twins/twins-pcpvt-base_8xb128_in1k.py index 3ac5d2ad..a52a0ea2 100644 --- a/configs/twins/twins-pcpvt-base_8xb128_in1k.py +++ b/configs/twins/twins-pcpvt-base_8xb128_in1k.py @@ -39,3 +39,8 @@ param_scheduler = [ begin=5, end=300) ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/twins/twins-pcpvt-large_16xb64_in1k.py b/configs/twins/twins-pcpvt-large_16xb64_in1k.py index b5ad5472..c136c02c 100644 --- a/configs/twins/twins-pcpvt-large_16xb64_in1k.py +++ b/configs/twins/twins-pcpvt-large_16xb64_in1k.py @@ -5,3 +5,8 @@ model = dict(backbone=dict(arch='large'), head=dict(in_channels=512)) # dataset settings train_dataloader = dict(batch_size=128) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/twins/twins-pcpvt-small_8xb128_in1k.py b/configs/twins/twins-pcpvt-small_8xb128_in1k.py index 9fe763b7..5530e397 100644 --- a/configs/twins/twins-pcpvt-small_8xb128_in1k.py +++ b/configs/twins/twins-pcpvt-small_8xb128_in1k.py @@ -2,3 +2,8 @@ _base_ = ['twins-pcpvt-base_8xb128_in1k.py'] # model settings model = dict(backbone=dict(arch='small'), head=dict(in_channels=512)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/twins/twins-svt-base_8xb128_in1k.py b/configs/twins/twins-svt-base_8xb128_in1k.py index 1d24f63b..0f00981f 100644 --- a/configs/twins/twins-svt-base_8xb128_in1k.py +++ b/configs/twins/twins-svt-base_8xb128_in1k.py @@ -39,3 +39,8 @@ param_scheduler = [ begin=5, end=300) ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/twins/twins-svt-large_16xb64_in1k.py b/configs/twins/twins-svt-large_16xb64_in1k.py index e8a1eba8..e32dbe11 100644 --- a/configs/twins/twins-svt-large_16xb64_in1k.py +++ b/configs/twins/twins-svt-large_16xb64_in1k.py @@ -5,3 +5,8 @@ model = dict(backbone=dict(arch='large'), head=dict(in_channels=1024)) # dataset settings train_dataloader = dict(batch_size=64) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/twins/twins-svt-small_8xb128_in1k.py b/configs/twins/twins-svt-small_8xb128_in1k.py index 2ffe267b..97c5a945 100644 --- a/configs/twins/twins-svt-small_8xb128_in1k.py +++ b/configs/twins/twins-svt-small_8xb128_in1k.py @@ -2,3 +2,8 @@ _base_ = ['twins-svt-base_8xb128_in1k.py'] # model settings model = dict(backbone=dict(arch='small'), head=dict(in_channels=512)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/van/van-base_8xb128_in1k.py b/configs/van/van-base_8xb128_in1k.py index 3099dc37..4dd53c58 100644 --- a/configs/van/van-base_8xb128_in1k.py +++ b/configs/van/van-base_8xb128_in1k.py @@ -63,3 +63,8 @@ test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/van/van-large_8xb128_in1k.py b/configs/van/van-large_8xb128_in1k.py index a843b732..7341329b 100644 --- a/configs/van/van-large_8xb128_in1k.py +++ b/configs/van/van-large_8xb128_in1k.py @@ -63,3 +63,8 @@ test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/van/van-small_8xb128_in1k.py b/configs/van/van-small_8xb128_in1k.py index 04322ae7..29e14e34 100644 --- a/configs/van/van-small_8xb128_in1k.py +++ b/configs/van/van-small_8xb128_in1k.py @@ -63,3 +63,8 @@ test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/van/van-tiny_8xb128_in1k.py b/configs/van/van-tiny_8xb128_in1k.py index 1d2d799e..f1eb385b 100644 --- a/configs/van/van-tiny_8xb128_in1k.py +++ b/configs/van/van-tiny_8xb128_in1k.py @@ -63,3 +63,8 @@ test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) # schedule settings optim_wrapper = dict(clip_grad=dict(max_norm=5.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/vgg/vgg11_8xb32_in1k.py b/configs/vgg/vgg11_8xb32_in1k.py index 616233c4..08e2c9d9 100644 --- a/configs/vgg/vgg11_8xb32_in1k.py +++ b/configs/vgg/vgg11_8xb32_in1k.py @@ -7,3 +7,8 @@ _base_ = [ # schedule settings optim_wrapper = dict(optimizer=dict(lr=0.01)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/vgg/vgg11bn_8xb32_in1k.py b/configs/vgg/vgg11bn_8xb32_in1k.py index 22f55ef0..dec892c8 100644 --- a/configs/vgg/vgg11bn_8xb32_in1k.py +++ b/configs/vgg/vgg11bn_8xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py', ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/vgg/vgg13_8xb32_in1k.py b/configs/vgg/vgg13_8xb32_in1k.py index ec1c98fb..8602b255 100644 --- a/configs/vgg/vgg13_8xb32_in1k.py +++ b/configs/vgg/vgg13_8xb32_in1k.py @@ -7,3 +7,8 @@ _base_ = [ # schedule settings optim_wrapper = dict(optimizer=dict(lr=0.01)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/vgg/vgg13bn_8xb32_in1k.py b/configs/vgg/vgg13bn_8xb32_in1k.py index 3cb3592b..55096eb3 100644 --- a/configs/vgg/vgg13bn_8xb32_in1k.py +++ b/configs/vgg/vgg13bn_8xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py', ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/vgg/vgg16_8xb16_voc.py b/configs/vgg/vgg16_8xb16_voc.py index 22b1891d..e4541979 100644 --- a/configs/vgg/vgg16_8xb16_voc.py +++ b/configs/vgg/vgg16_8xb16_voc.py @@ -36,3 +36,8 @@ param_scheduler = dict(type='StepLR', by_epoch=True, step_size=20, gamma=0.1) train_cfg = dict(by_epoch=True, max_epochs=40, val_interval=1) val_cfg = dict() test_cfg = dict() + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/configs/vgg/vgg16_8xb32_in1k.py b/configs/vgg/vgg16_8xb32_in1k.py index a291da28..80c8f11b 100644 --- a/configs/vgg/vgg16_8xb32_in1k.py +++ b/configs/vgg/vgg16_8xb32_in1k.py @@ -7,3 +7,8 @@ _base_ = [ # schedule settings optim_wrapper = dict(optimizer=dict(lr=0.01)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/vgg/vgg16bn_8xb32_in1k.py b/configs/vgg/vgg16bn_8xb32_in1k.py index f6bbb81b..fce036c8 100644 --- a/configs/vgg/vgg16bn_8xb32_in1k.py +++ b/configs/vgg/vgg16bn_8xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py', ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/vgg/vgg19_8xb32_in1k.py b/configs/vgg/vgg19_8xb32_in1k.py index 88cd24c1..50bcb2cc 100644 --- a/configs/vgg/vgg19_8xb32_in1k.py +++ b/configs/vgg/vgg19_8xb32_in1k.py @@ -7,3 +7,8 @@ _base_ = [ # schedule settings optim_wrapper = dict(optimizer=dict(lr=0.01)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/vgg/vgg19bn_8xb32_in1k.py b/configs/vgg/vgg19bn_8xb32_in1k.py index 4b4f34ab..b9defa93 100644 --- a/configs/vgg/vgg19bn_8xb32_in1k.py +++ b/configs/vgg/vgg19bn_8xb32_in1k.py @@ -4,3 +4,8 @@ _base_ = [ '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py', ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/vision_transformer/vit-base-p16_ft-64xb64_in1k-384.py b/configs/vision_transformer/vit-base-p16_ft-64xb64_in1k-384.py index 539fecc7..a4cd4c92 100644 --- a/configs/vision_transformer/vit-base-p16_ft-64xb64_in1k-384.py +++ b/configs/vision_transformer/vit-base-p16_ft-64xb64_in1k-384.py @@ -36,3 +36,8 @@ test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) # schedule setting optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py b/configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py index 68e8b02b..4dbd9a46 100644 --- a/configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py +++ b/configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py @@ -51,3 +51,8 @@ optim_wrapper = dict( # runtime settings custom_hooks = [dict(type='EMAHook', momentum=1e-4)] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/vision_transformer/vit-base-p16_pt-64xb64_in1k-224.py b/configs/vision_transformer/vit-base-p16_pt-64xb64_in1k-224.py index 0a9e5156..d25281b8 100644 --- a/configs/vision_transformer/vit-base-p16_pt-64xb64_in1k-224.py +++ b/configs/vision_transformer/vit-base-p16_pt-64xb64_in1k-224.py @@ -13,3 +13,8 @@ model = dict( # schedule setting optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py b/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py index 2322c226..53ae6adf 100644 --- a/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py +++ b/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py @@ -36,3 +36,8 @@ test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) # schedule setting optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/vision_transformer/vit-base-p32_pt-64xb64_in1k-224.py b/configs/vision_transformer/vit-base-p32_pt-64xb64_in1k-224.py index 83a92fca..f6ba5174 100644 --- a/configs/vision_transformer/vit-base-p32_pt-64xb64_in1k-224.py +++ b/configs/vision_transformer/vit-base-p32_pt-64xb64_in1k-224.py @@ -13,3 +13,8 @@ model = dict( # schedule setting optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py b/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py index 917117c7..e4e4b25f 100644 --- a/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py +++ b/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py @@ -36,3 +36,8 @@ test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) # schedule setting optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/vision_transformer/vit-large-p16_pt-64xb64_in1k-224.py b/configs/vision_transformer/vit-large-p16_pt-64xb64_in1k-224.py index 0cf9d8e1..e10336aa 100644 --- a/configs/vision_transformer/vit-large-p16_pt-64xb64_in1k-224.py +++ b/configs/vision_transformer/vit-large-p16_pt-64xb64_in1k-224.py @@ -13,3 +13,8 @@ model = dict( # schedule setting optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/vision_transformer/vit-large-p32_ft-64xb64_in1k-384.py b/configs/vision_transformer/vit-large-p32_ft-64xb64_in1k-384.py index 65bdf41e..5afd05de 100644 --- a/configs/vision_transformer/vit-large-p32_ft-64xb64_in1k-384.py +++ b/configs/vision_transformer/vit-large-p32_ft-64xb64_in1k-384.py @@ -36,3 +36,8 @@ test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) # schedule setting optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/vision_transformer/vit-large-p32_pt-64xb64_in1k-224.py b/configs/vision_transformer/vit-large-p32_pt-64xb64_in1k-224.py index c1b5a3d8..fc31b490 100644 --- a/configs/vision_transformer/vit-large-p32_pt-64xb64_in1k-224.py +++ b/configs/vision_transformer/vit-large-p32_pt-64xb64_in1k-224.py @@ -13,3 +13,8 @@ model = dict( # schedule setting optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/wrn/wide-resnet101_8xb32_in1k.py b/configs/wrn/wide-resnet101_8xb32_in1k.py index d1bf5e5e..34d02f33 100644 --- a/configs/wrn/wide-resnet101_8xb32_in1k.py +++ b/configs/wrn/wide-resnet101_8xb32_in1k.py @@ -5,3 +5,8 @@ _base_ = [ ] model = dict(backbone=dict(depth=101)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/wrn/wide-resnet50_8xb32_in1k.py b/configs/wrn/wide-resnet50_8xb32_in1k.py index edf6a051..0de276ba 100644 --- a/configs/wrn/wide-resnet50_8xb32_in1k.py +++ b/configs/wrn/wide-resnet50_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_resize.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/configs/wrn/wide-resnet50_timm_8xb32_in1k.py b/configs/wrn/wide-resnet50_timm_8xb32_in1k.py index 8dca8f37..393ec168 100644 --- a/configs/wrn/wide-resnet50_timm_8xb32_in1k.py +++ b/configs/wrn/wide-resnet50_timm_8xb32_in1k.py @@ -3,3 +3,8 @@ _base_ = [ '../_base_/datasets/imagenet_bs32_pil_bicubic.py', '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py' ] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256)