diff --git a/docs/source/tutorials/nni_hpo_dlc.md b/docs/source/tutorials/nni_hpo_dlc.md new file mode 100644 index 00000000..464f68d4 --- /dev/null +++ b/docs/source/tutorials/nni_hpo_dlc.md @@ -0,0 +1,134 @@ +# NNI HPO dlc tutorial + +Auto hyperparameter optimization (HPO), or auto tuning, is one of the key features of NNI. This tutorial shows an example of EasyCV for dlc using NNI HPO. + +## Create environment + +Create NAS disks, NAS datasets, and DSW/ECS (ps: Note that the three parts are created in the same region). + +Mount NAS disks on DSW/ECS (ps: The address where the NAS is mounted can be the same as the mount path /mnt/data where the NAS data set is created to avoid errors). + +For details about the create environment, see https://yuque.antfin.com/pai-user/manual/rwk4sh. + +## Installation + +```shell +hpo_tools: +pip install https://automl-nni.oss-cn-beijing.aliyuncs.com/nni/hpo_tools/hpo_tools-0.1.1-py3-none-any.whl + +dlc_tools: +wget https://automl-nni.oss-cn-beijing.aliyuncs.com/nni/hpo_tools/scripts/install_dlc.sh +source install_dlc.sh /mnt/data https://dlc-tools.oss-cn-zhangjiakou.aliyuncs.com/release/linux/dlc?spm=a2c4g.11186623.0.0.1b9b4a35er7EfB +(ps: install_dlc.sh has two inputs. The first input specifies the default path where the dlc tool is installed, and the second input specifies the url link to the dlc tool. +/mnt/data is the root directory where the EasyCV code resides.) + +# test +cd /mnt/data/software +dlc --help +``` + +## RUN +Take easycv/toolkit/hpo/search/det/ as an example + +```shell +cd EasyCV/easycv/toolkit/hpo/det/ + +nnictl create --config config_dlc.yml --port=8780 + + +## STOP +nnictl stop +``` + +For more nnictl usage, see https://nni.readthedocs.io/en/v2.1/Tutorial/QuickStart.html. 
+ +## config_dlc.yml file parameter meaning +```shell +experimentWorkingDirectory: ./expdir +searchSpaceFile: search_space.json +trialCommand: python3 ../common/run.py --config=./config_dlc.ini +trialConcurrency: 1 +maxTrialNumber: 4 +debug: true +logLevel: debug +trainingService: + platform: local +tuner: + name: TPE + classArgs: + optimize_mode: maximize +assessor: + codeDirectory: /path/to/site-packages/hpo_tools/core/assessor + className: dlc_assessor.DLCAssessor + classArgs: + optimize_mode: maximize + start_step: 2 +``` +
+Arguments + +- `experimentWorkingDirectory`: the directory where the experiment output is saved +- `searchSpaceFile`: the file that defines the search space +- `trialCommand`: the command that starts each trial; it runs run.py (`--config` specifies the path of the ini config file) +- `trainingService.platform`: the training platform +- `tuner`: the tuner algorithm +- `assessor`: the assessor algorithm +- `classArgs`: the parameters of the algorithm + +
+ +The search space can reference: https://nni.readthedocs.io/en/v2.2/Tutorial/SearchSpaceSpec.html. + +## config_dlc.ini file parameter meaning +```shell +[cmd_config] +cmd1="dlc config --access_id xxx --access_key xxx --endpoint 'pai-dlc.cn-shanghai.aliyuncs.com' --region cn-shanghai" +cmd2="dlc submit pytorch --name=test_nni_${exp_id}_${trial_id} \ + --workers=1 \ + --worker_cpu=12 \ + --worker_gpu=1 \ + --worker_memory=10Gi \ + --worker_spec='ecs.gn6v-c10g1.20xlarge' \ + --data_sources='d-domlyt834bngpr68iu' \ + --worker_image=registry-vpc.cn-shanghai.aliyuncs.com/mybigpai/nni:0.0.3 \ + --command='cd ../../../../../ && pip install mmcv-full && pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple \ + && CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.launch --nproc_per_node=8 --master_port=29400 tools/train.py easycv/toolkit/hpo/search/det/fcos_r50_torch_1x_coco.py --work_dir easycv/toolkit/hpo/search/det/model/model_${exp_id}_${trial_id} --launcher pytorch --seed 42 --deterministic --user_config_params --data_root /root/data/coco/ --data.imgs_per_gpu ${batch_size} --optimizer.lr ${lr} ' \ + --workspace_id='255705' " + +[metric_config] +metric_filepath=easycv/toolkit/hpo/search/det/model/model_${exp_id}_${trial_id}/tf_logs +val/DetectionBoxes_Precision/mAP=100 +``` +
+Arguments + +cmd1 configures the dlc client (credentials, endpoint and region), and cmd2 is the command that submits the dlc job. + +[cmd_config] + +The following parameters need to be modified according to your dlc environment (for details about the dlc command parameters, see https://yuque.antfin-inc.com/pai-user/manual/eo7doa): +- `access_id` and `access_key`: the AccessKey (AK) credentials +- `endpoint`: the dlc service endpoint +- `region`: the region +- `name`: the experiment name +- `workers`: the number of machines +- `worker_cpu`: the number of CPUs +- `worker_gpu`: the number of GPUs +- `worker_memory`: the amount of memory required +- `worker_spec`: the machine type +- `data_sources`: the ID of the data source that maps to the mounted NAS; the dlc job is started with this data source +- `worker_image`: the image to use +- `workspace_id`: the workspace + +The following parameters do not need to be modified according to the dlc environment: +- `command`: the command to start the easycv experiment +- `user_config_params`: the parameters whose values are selected from search_space.json + +[metric_config] +- `metric_filepath`: the tf_logs directory saved by the experiment, which is used to obtain the metrics for the hpo evaluation + +In the example above, the detection mAP is used as the evaluation metric, with a maximum value of 100. + +
+ +Tuning method can be reference NNI way of use: https://nni.readthedocs.io/en/v2.1/Overview.html. diff --git a/docs/source/tutorials/nni_hpo_local.md b/docs/source/tutorials/nni_hpo_local.md new file mode 100644 index 00000000..a54c9e9b --- /dev/null +++ b/docs/source/tutorials/nni_hpo_local.md @@ -0,0 +1,97 @@ +# NNI HPO local tutorial + +Auto hyperparameter optimization (HPO), or auto tuning, is one of the key features of NNI. This tutorial shows an example of EasyCV for local using NNI HPO. + +## Create environment + +Create DSW/ECS. + +For details about the create environment, see https://yuque.antfin.com/pai-user/manual/rwk4sh. + +## Installation + +```shell +hpo_tools: +pip install https://automl-nni.oss-cn-beijing.aliyuncs.com/nni/hpo_tools/hpo_tools-0.1.1-py3-none-any.whl + +``` + +## RUN +Take easycv/toolkit/hpo/search/det/ as an example + +```shell +cd EasyCV/easycv/toolkit/hpo/det/ + +nnictl create --config config_local.yml --port=8780 + +## STOP +nnictl stop +``` + +For more nnictl usage, see https://nni.readthedocs.io/en/v2.1/Tutorial/QuickStart.html. + +## config_local.yml file parameter meaning +```shell +experimentWorkingDirectory: ./expdir +searchSpaceFile: search_space.json +trialCommand: python3 ../common/run.py --config=./config_local.ini +trialConcurrency: 1 +maxTrialNumber: 4 +debug: true +logLevel: debug +trainingService: + platform: local +tuner: + name: TPE + classArgs: + optimize_mode: maximize +assessor: + codeDirectory: /root/anaconda3/lib/python3.9/site-packages/hpo_tools/core/assessor + className: dlc_assessor.DLCAssessor + classArgs: + optimize_mode: maximize + start_step: 2 + moving_avg: true + proportion: 0.6 + patience: 2 +``` +
+Arguments + +- `experimentWorkingDirectory`: the directory where the experiment output is saved +- `searchSpaceFile`: the file that defines the search space +- `trialCommand`: the command that starts each trial; it runs run.py (`--config` specifies the path of the ini config file) +- `trainingService.platform`: the training platform +- `tuner`: the tuner algorithm +- `assessor`: the assessor algorithm +- `classArgs`: the parameters of the algorithm + +
+ +The search space can reference: https://nni.readthedocs.io/en/v2.2/Tutorial/SearchSpaceSpec.html. + +## config_local.ini file parameter meaning +```shell +[cmd_config] +cmd1='cd /mnt/data/EasyCV && CUDA_VISIBLE_DEVICES=0,1,2,3,4 python -m torch.distributed.launch --nproc_per_node=4 --master_port=29400 tools/train.py easycv/toolkit/hpo/det/fcos_r50_torch_1x_coco.py --work_dir easycv/toolkit/hpo/det/model/model_${exp_id}_${trial_id} --launcher pytorch --seed 42 --deterministic --user_config_params --data_root /mnt/data/coco/ --data.imgs_per_gpu ${batch_size} --optimizer.lr ${lr} ' + +[metric_config] +metric_filepath=easycv/toolkit/hpo/det/model/model_${exp_id}_${trial_id}/tf_logs +val/DetectionBoxes_Precision/mAP=100 +``` +
+Arguments + +cmd1 is the command that runs the training locally. + +[cmd_config] +- `user_config_params`: the parameters whose values are selected from search_space.json + +[metric_config] +- `metric_filepath`: the tf_logs directory saved by the experiment, which is used to obtain the metrics for the hpo evaluation + +In the example above, the detection mAP is used as the evaluation metric, with a maximum value of 100. + +
+ +Tuning method can be reference NNI way of use: https://nni.readthedocs.io/en/v2.1/Overview.html. diff --git a/easycv/toolkit/hpo/common/run.py b/easycv/toolkit/hpo/common/run.py new file mode 100644 index 00000000..9f5c370c --- /dev/null +++ b/easycv/toolkit/hpo/common/run.py @@ -0,0 +1,93 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import argparse +import logging +import os + +import nni +from hpo_tools.core.metric.report_summary_metric import report_result +from hpo_tools.core.platform.dlc.dlc_utils import kill_job, run_multi_command +from hpo_tools.core.utils.config_utils import parse_ini +from hpo_tools.core.utils.json_utils import set_value +from hpo_tools.core.utils.path_utils import unique_path + + +def get_params(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--config', type=str, help='config path', default='./config_oss.ini') + args, _ = parser.parse_known_args() + return args + + +if __name__ == '__main__': + + try: + args = get_params() + logging.info('args: %s', args) + + config = parse_ini(args.config) + + cmd_config = config['cmd_config'] + logging.info('cmd_config: %s', cmd_config) + + oss_config = config.get('oss_config', None) + + # get parameters form tuner + tuner_params = nni.get_next_parameter() + trial_id = str(nni.get_trial_id()) + experment_id = str(nni.get_experiment_id()) + + # update parameter + for k, cmd in cmd_config.items(): + cmd = cmd.replace('${exp_id}', experment_id) + cmd = cmd.replace('${trial_id}', trial_id) + tuner_params_list = '' + tuner_params_dict = '' + for p, v in tuner_params.items(): + cmd = cmd.replace(p, str(v)) + tuner_params_list += p + ' ' + str(v) + ' ' + tuner_params_dict += p + '=' + str(v) + ' ' + cmd = cmd.replace('${tuner_params_list}', tuner_params_list) + cmd = cmd.replace('${tuner_params_dict}', tuner_params_dict) + cmd_config[k] = cmd + + # report metric + metric_dict = config['metric_config'] + logging.info('metric dict: %s', metric_dict) + metric_filepath = 
metric_dict['metric_filepath'] + metric_filepath = metric_filepath.replace('${exp_id}', experment_id) + metric_filepath = metric_filepath.replace('${trial_id}', trial_id) + metric_dict.pop('metric_filepath') + + if metric_filepath.startswith('oss'): + dst_filepath = unique_path('../exp') + set_value( + 'expdir', os.path.abspath(dst_filepath), trial_id=trial_id) + ori_filepath = metric_filepath + else: + ori_filepath = None + dst_filepath = metric_filepath + + report_result( + ori_filepath, + dst_filepath, + metric_dict, + trial_id, + use_best=True, + oss_config=oss_config) + + # for earlystop or user_canceled + nni.report_intermediate_result(0) + + # run command + run_multi_command(cmd_config, trial_id) + + except Exception: + logging.exception('run begin error') + exit(1) + + finally: + # kill instance + kill_job(trial_id=trial_id) + # for kill report result + set_value(trial_id + '_exit', '1', trial_id=trial_id) diff --git a/easycv/toolkit/hpo/det/config_dlc.ini b/easycv/toolkit/hpo/det/config_dlc.ini new file mode 100644 index 00000000..dd2b2a9c --- /dev/null +++ b/easycv/toolkit/hpo/det/config_dlc.ini @@ -0,0 +1,17 @@ +[cmd_config] +cmd1="dlc config --access_id xxx --access_key xxx --endpoint 'pai-dlc.cn-shanghai.aliyuncs.com' --region cn-shanghai" +cmd2="dlc submit pytorch --name=test_nni_${exp_id}_${trial_id} \ + --workers=1 \ + --worker_cpu=12 \ + --worker_gpu=1 \ + --worker_memory=10Gi \ + --worker_spec='ecs.gn6v-c10g1.20xlarge' \ + --data_sources='d-domlyt834bngpr68iu' \ + --worker_image=registry-vpc.cn-shanghai.aliyuncs.com/mybigpai/nni:0.0.3 \ + --command='cd /mnt/data/EasyCV && pip install mmcv-full && pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple \ + && CUDA_VISIBLE_DEVICES=0,1,2,3,4 python -m torch.distributed.launch --nproc_per_node=4 --master_port=29400 tools/train.py easycv/toolkit/hpo/det/fcos_r50_torch_1x_coco.py --work_dir easycv/toolkit/hpo/det/model/model_${exp_id}_${trial_id} --launcher pytorch --seed 42 
--deterministic --user_config_params --data_root /mnt/data/coco/ --data.imgs_per_gpu ${batch_size} --optimizer.lr ${lr} ' \ + --workspace_id='255705' " + +[metric_config] +metric_filepath=easycv/toolkit/hpo/det/model/model_${exp_id}_${trial_id}/tf_logs +val/DetectionBoxes_Precision/mAP=100 diff --git a/easycv/toolkit/hpo/det/config_dlc.yml b/easycv/toolkit/hpo/det/config_dlc.yml new file mode 100644 index 00000000..b5b65996 --- /dev/null +++ b/easycv/toolkit/hpo/det/config_dlc.yml @@ -0,0 +1,19 @@ +experimentWorkingDirectory: ./expdir +searchSpaceFile: search_space.json +trialCommand: python3 ../common/run.py --config=./config_dlc.ini +trialConcurrency: 1 +maxTrialNumber: 4 +debug: true +logLevel: debug +trainingService: + platform: local +tuner: + name: TPE + classArgs: + optimize_mode: maximize +assessor: + codeDirectory: /root/anaconda3/lib/python3.9/site-packages/hpo_tools/core/assessor + className: dlc_assessor.DLCAssessor + classArgs: + optimize_mode: maximize + start_step: 2 diff --git a/easycv/toolkit/hpo/det/config_local.ini b/easycv/toolkit/hpo/det/config_local.ini new file mode 100644 index 00000000..189a87ea --- /dev/null +++ b/easycv/toolkit/hpo/det/config_local.ini @@ -0,0 +1,6 @@ +[cmd_config] +cmd1='cd /mnt/data/EasyCV && CUDA_VISIBLE_DEVICES=0,1,2,3,4 python -m torch.distributed.launch --nproc_per_node=4 --master_port=29400 tools/train.py easycv/toolkit/hpo/det/fcos_r50_torch_1x_coco.py --work_dir easycv/toolkit/hpo/det/model/model_${exp_id}_${trial_id} --launcher pytorch --seed 42 --deterministic --user_config_params --data_root /mnt/data/coco/ --data.imgs_per_gpu ${batch_size} --optimizer.lr ${lr} ' + +[metric_config] +metric_filepath=easycv/toolkit/hpo/det/model/model_${exp_id}_${trial_id}/tf_logs +val/DetectionBoxes_Precision/mAP=100 diff --git a/easycv/toolkit/hpo/det/config_local.yml b/easycv/toolkit/hpo/det/config_local.yml new file mode 100644 index 00000000..473e1993 --- /dev/null +++ b/easycv/toolkit/hpo/det/config_local.yml @@ -0,0 +1,24 
@@ +experimentWorkingDirectory: ./expdir +searchSpaceFile: search_space.json +trialCommand: python3 ../common/run.py --config=./config_local.ini +# trialGpuNumber: 4 +trialConcurrency: 1 +maxTrialNumber: 4 +debug: true +logLevel: debug +trainingService: + platform: local + # useActiveGpu: true +tuner: + name: TPE + classArgs: + optimize_mode: maximize +assessor: + codeDirectory: /root/anaconda3/lib/python3.9/site-packages/hpo_tools/core/assessor + className: dlc_assessor.DLCAssessor + classArgs: + optimize_mode: maximize + start_step: 2 + moving_avg: true + proportion: 0.6 + patience: 2 diff --git a/easycv/toolkit/hpo/det/fcos_r50_torch_1x_coco.py b/easycv/toolkit/hpo/det/fcos_r50_torch_1x_coco.py new file mode 100644 index 00000000..402b11d9 --- /dev/null +++ b/easycv/toolkit/hpo/det/fcos_r50_torch_1x_coco.py @@ -0,0 +1,192 @@ +train_cfg = {} +test_cfg = {} +optimizer_config = dict() # grad_clip, coalesce, bucket_size_mb +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +dist_params = dict(backend='nccl') +cudnn_benchmark = False +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] + +CLASSES = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', + 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', + 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', + 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', + 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 
'cell phone', 'microwave', 'oven', 'toaster', 'sink', + 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush' +] + +# dataset settings +data_root = '/mnt/data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +train_pipeline = [ + dict(type='MMResize', img_scale=(1333, 800), keep_ratio=True), + dict(type='MMRandomFlip', flip_ratio=0.5), + dict(type='MMNormalize', **img_norm_cfg), + dict(type='MMPad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels'], + meta_keys=('filename', 'ori_filename', 'ori_shape', 'ori_img_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction', 'img_norm_cfg')) +] +test_pipeline = [ + dict( + type='MMMultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='MMResize', keep_ratio=True), + dict(type='MMRandomFlip'), + dict(type='MMNormalize', **img_norm_cfg), + dict(type='MMPad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img'], + meta_keys=('filename', 'ori_filename', 'ori_shape', + 'ori_img_shape', 'img_shape', 'pad_shape', + 'scale_factor', 'flip', 'flip_direction', + 'img_norm_cfg')) + ]) +] + +train_dataset = dict( + type='DetDataset', + data_source=dict( + type='DetSourceCoco', + ann_file='${data_root}' + 'annotations/instances_train2017.json', + img_prefix='${data_root}' + 'train2017/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ], + classes=CLASSES, + test_mode=False, + filter_empty_gt=True, + iscrowd=False), + pipeline=train_pipeline) + +val_dataset = dict( + type='DetDataset', + imgs_per_gpu=1, + data_source=dict( + type='DetSourceCoco', + ann_file='${data_root}' + 'annotations/instances_val2017.json', + img_prefix='${data_root}' + 'val2017/', + pipeline=[ + dict(type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True) + ], + classes=CLASSES, + test_mode=True, + filter_empty_gt=False, + iscrowd=True), + pipeline=test_pipeline) + +data = dict( + imgs_per_gpu=2, workers_per_gpu=2, train=train_dataset, val=val_dataset) + +# evaluation +eval_config = dict(interval=1, gpu_collect=False) +eval_pipelines = [ + dict( + mode='test', + evaluators=[ + dict(type='CocoDetectionEvaluator', classes=CLASSES), + ], + ) +] + +# model settings +model = dict( + type='Detection', + pretrained=True, + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3, 4), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', # use P5 + num_outs=5, + relu_before_extra_convs=True), + head=dict( + type='FCOSHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + center_sampling=True, + center_sample_radius=1.5, + norm_on_bbox=True, + centerness_on_reg=True, + conv_cfg=None, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + conv_bias=True, + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100))) + +checkpoint_config = dict(interval=10) +# optimizer +optimizer = dict( + type='SGD', + lr=0.01, + momentum=0.9, + weight_decay=0.0001, + paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.)) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) + 
+total_epochs = 12 + +find_unused_parameters = False diff --git a/easycv/toolkit/hpo/det/search_space.json b/easycv/toolkit/hpo/det/search_space.json new file mode 100644 index 00000000..3306cee9 --- /dev/null +++ b/easycv/toolkit/hpo/det/search_space.json @@ -0,0 +1,4 @@ +{ + "${batch_size}": {"_type":"choice", "_value": [1, 2, 4]}, + "${lr}":{"_type":"choice","_value":[0.0001, 0.001, 0.01]} +} diff --git a/tools/train.py b/tools/train.py index 96f93db8..0c1f4563 100644 --- a/tools/train.py +++ b/tools/train.py @@ -130,7 +130,7 @@ def main(): cfg = mmcv_config_fromfile(args.config) if args.user_config_params is not None: - assert args.model_type is not None, 'model_type must be setted' + # assert args.model_type is not None, 'model_type must be setted' # rebuild config by user config params cfg = rebuild_config(cfg, args.user_config_params)