mmpretrain/configs/mae/benchmarks/vit-large-p16_8xb128-ds-zero3-coslr-50e_in1k.py

# Base config this file builds on (sibling file in the same directory).
_base_ = [
    './vit-large-p16_8xb128-coslr-50e_in1k.py',
]

# Optimizer wrapper: use the DeepSpeed-specific wrapper type.
optim_wrapper = {'type': 'DeepSpeedOptimWrapper'}

# Training strategy: DeepSpeed with ZeRO stage 3 and fp16 enabled.
strategy = {
    'type': 'DeepSpeedStrategy',
    # fp16 / loss-scaling settings.
    # NOTE(review): DeepSpeed documents loss_scale=0 as "dynamic loss
    # scaling" -- confirm against the DeepSpeed version in use.
    'fp16': {
        'enabled': True,
        'fp16_master_weights_and_grads': False,
        'loss_scale': 0,
        'loss_scale_window': 500,
        'hysteresis': 2,
        'min_loss_scale': 1,
        'initial_scale_power': 15,
    },
    # Name(s) of the inputs to be converted to half precision.
    'inputs_to_half': ['inputs'],
    # ZeRO settings (stage 3, no CPU offload).
    'zero_optimization': {
        'stage': 3,
        'allgather_partitions': True,
        'reduce_scatter': True,
        'allgather_bucket_size': 50_000_000,
        'reduce_bucket_size': 50_000_000,
        'overlap_comm': True,
        'contiguous_gradients': True,
        'cpu_offload': False,
    },
}

# Runner type that supports strategies.
runner_type = 'FlexibleRunner'
[Enhancement] Support deepspeed with flexible runner (#1673)
* [Feature] Support deepspeed with flexible runner
* [Fix] Reformat with yapf
* [Refactor] Rename configs
* [Fix] Reformat with yapf
* [Refactor] Remove unused keys
* [Refactor] Change the _base_ path
* [Refactor] Reformat
2023-06-29 10:16:27 +08:00
			`_base_ = ['./vit-large-p16_8xb128-coslr-50e_in1k.py']`

			`# optimizer wrapper`
			`optim_wrapper = dict(type='DeepSpeedOptimWrapper')`

			`# training strategy`
			`# Deepspeed with ZeRO3 + fp16`
			`strategy = dict(`
			`type='DeepSpeedStrategy',`
			`fp16=dict(`
			`enabled=True,`
			`fp16_master_weights_and_grads=False,`
			`loss_scale=0,`
			`loss_scale_window=500,`
			`hysteresis=2,`
			`min_loss_scale=1,`
			`initial_scale_power=15,`
			`),`
			`inputs_to_half=['inputs'],`
			`zero_optimization=dict(`
			`stage=3,`
			`allgather_partitions=True,`
			`reduce_scatter=True,`
			`allgather_bucket_size=50000000,`
			`reduce_bucket_size=50000000,`
			`overlap_comm=True,`
			`contiguous_gradients=True,`
			`cpu_offload=False,`
			`))`

			`# runner which supports strategies`
			`runner_type = 'FlexibleRunner'`