diff --git a/.gitignore b/.gitignore index 1828f7e8..f63cdea2 100644 --- a/.gitignore +++ b/.gitignore @@ -137,6 +137,3 @@ pai_jobs/easycv/resources/ *.tar.gz thirdparty/test scripts/test - -# easycv default cache dir -.easycv_cache diff --git a/benchmarks/selfsup/classification/imagenet/dino_deit_small_p16_8xb2048_20e_feature.py b/benchmarks/selfsup/classification/imagenet/dino_deit_small_p16_8xb2048_20e_feature.py index 2d58705e..0e6ef6bc 100644 --- a/benchmarks/selfsup/classification/imagenet/dino_deit_small_p16_8xb2048_20e_feature.py +++ b/benchmarks/selfsup/classification/imagenet/dino_deit_small_p16_8xb2048_20e_feature.py @@ -10,7 +10,7 @@ oss_io_config = dict( buckets=['your oss buckets']) # model settings -# 1920: merge 4 layers of features, open models/backbones/vit_transfomer_dynamic.py:311: self.forward_return_n_last_blocks +# 1920: merge 4 layers of features, open models/backbones/vit_transformer_dynamic.py:311: self.forward_return_n_last_blocks # 384: default feature_num = 1920 model = dict( diff --git a/benchmarks/selfsup/classification/imagenet/mae_vit_large_patch16_8xb16_50e_lrdecay075_fintune.py b/benchmarks/selfsup/classification/imagenet/mae_vit_large_patch16_8xb16_50e_lrdecay075_fintune.py index 31694589..fa008b3e 100644 --- a/benchmarks/selfsup/classification/imagenet/mae_vit_large_patch16_8xb16_50e_lrdecay075_fintune.py +++ b/benchmarks/selfsup/classification/imagenet/mae_vit_large_patch16_8xb16_50e_lrdecay075_fintune.py @@ -157,3 +157,6 @@ checkpoint_config = dict(interval=10) # runtime settings total_epochs = 50 + +# export config +export = dict(export_neck=True) diff --git a/benchmarks/selfsup/classification/imagenet/moby_deit_small_p16_8xb2048_30e_feature.py b/benchmarks/selfsup/classification/imagenet/moby_deit_small_p16_8xb2048_30e_feature.py index 4c81e184..dcb45d31 100644 --- a/benchmarks/selfsup/classification/imagenet/moby_deit_small_p16_8xb2048_30e_feature.py +++ b/benchmarks/selfsup/classification/imagenet/moby_deit_small_p16_8xb2048_30e_feature.py @@ -10,7 +10,7 @@ oss_io_config = dict( buckets=['your oss buckets']) # model settings -# 1920: merge 4 layers of features, open models/backbones/vit_transfomer_dynamic.py:311: self.forward_return_n_last_blocks +# 1920: merge 4 layers of features, open models/backbones/vit_transformer_dynamic.py:311: self.forward_return_n_last_blocks # 384: default feature_num = 1920 model = dict( diff --git a/benchmarks/tools/extract.py b/benchmarks/tools/extract.py index 1aff8fa6..9214a282 100644 --- a/benchmarks/tools/extract.py +++ b/benchmarks/tools/extract.py @@ -15,6 +15,7 @@ from mmcv.runner import get_dist_info, init_dist, load_checkpoint from easycv.apis import set_random_seed from easycv.datasets import build_dataloader, build_dataset from easycv.file import io +from easycv.framework.errors import ValueError from easycv.models import build_model from easycv.utils.collect import dist_forward_collect, nondist_forward_collect from easycv.utils.config_tools import mmcv_config_fromfile diff --git a/benchmarks/tools/extract_backbone_weights.py b/benchmarks/tools/extract_backbone_weights.py index d3eb38f1..976611ee 100644 --- a/benchmarks/tools/extract_backbone_weights.py +++ b/benchmarks/tools/extract_backbone_weights.py @@ -3,6 +3,8 @@ import argparse import torch +from easycv.framework.errors import ValueError + def parse_args(): parser = argparse.ArgumentParser( @@ -24,7 +26,7 @@ def main(): output_dict['state_dict'][key[9:]] = value has_backbone = True if not has_backbone: - raise Exception('Cannot find a backbone 
module in the checkpoint.') + raise ValueError('Cannot find a backbone module in the checkpoint.') torch.save(output_dict, args.output) diff --git a/benchmarks/tools/linear_eval.py b/benchmarks/tools/linear_eval.py index 2f74e191..a892e4bf 100644 --- a/benchmarks/tools/linear_eval.py +++ b/benchmarks/tools/linear_eval.py @@ -2,11 +2,12 @@ import argparse import os import shutil -import sys import time import torch +from easycv.framework.errors import ValueError + args = argparse.ArgumentParser(description='Process some integers.') args.add_argument( 'model_path', @@ -88,7 +89,7 @@ def extract_model(model_path): output_dict['state_dict'][key[9:]] = value has_backbone = True if not has_backbone: - raise Exception('Cannot find a backbone module in the checkpoint.') + raise ValueError('Cannot find a backbone module in the checkpoint.') torch.save(output_dict, backbone_file) return backbone_file diff --git a/configs/classification/imagenet/resnet/resnet50_b32x8_100e_jpg.py b/configs/classification/imagenet/resnet/resnet50_b32x8_100e_jpg.py index 03124f20..a81e4ae2 100644 --- a/configs/classification/imagenet/resnet/resnet50_b32x8_100e_jpg.py +++ b/configs/classification/imagenet/resnet/resnet50_b32x8_100e_jpg.py @@ -86,3 +86,13 @@ checkpoint_config = dict(interval=10) # runtime settings total_epochs = 100 + +predict = dict( + type='ClassificationPredictor', + pipelines=[ + dict(type='Resize', size=256), + dict(type='CenterCrop', size=224), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Collect', keys=['img']) + ]) diff --git a/configs/classification/imagenet/vit/deitiii_base_patch16_192.py b/configs/classification/imagenet/vit/deitiii_base_patch16_192.py new file mode 100644 index 00000000..46b620f1 --- /dev/null +++ b/configs/classification/imagenet/vit/deitiii_base_patch16_192.py @@ -0,0 +1,143 @@ +# from PIL import Image + +_base_ = 'configs/base.py' + +log_config = dict( + interval=10, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) + +# model settings +model = dict( + type='Classification', + train_preprocess=['mixUp'], + pretrained=False, + mixup_cfg=dict( + mixup_alpha=0.8, + cutmix_alpha=1.0, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode='batch', + label_smoothing=0.0, + num_classes=1000), + backbone=dict( + type='VisionTransformer', + img_size=[192], + num_classes=1000, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + drop_rate=0., + drop_path_rate=0.2, + use_layer_scale=True), + head=dict( + type='ClsHead', + loss_config=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + label_ceil=True), + with_fc=False, + use_num_classes=False)) + +data_train_list = 'data/imagenet1k/train.txt' +data_train_root = 'data/imagenet1k/train/' +data_test_list = 'data/imagenet1k/val.txt' +data_test_root = 'data/imagenet1k/val/' + +dataset_type = 'ClsDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +three_augment_policies = [[ + dict(type='PILGaussianBlur', prob=1.0, radius_min=0.1, radius_max=2.0), +], [ + dict(type='Solarization', threshold=128), +], [ + dict(type='Grayscale', num_output_channels=3), +]] +train_pipeline = [ + dict( + type='RandomResizedCrop', size=192, scale=(0.08, 1.0), + interpolation=3), # interpolation='bicubic' + dict(type='RandomHorizontalFlip'), + dict(type='MMAutoAugment', policies=three_augment_policies), + dict(type='ColorJitter', brightness=0.3, contrast=0.3, saturation=0.3), + dict(type='ToTensor'), 
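+ # Note: the MMAutoAugment policies above follow DeiT III's 3-Augment recipe -- one of
+ # {PILGaussianBlur, Solarization, Grayscale} is sampled per image (assuming MMAutoAugment's
+ # usual random choice over sub-policies), with ColorJitter applied on top of the sampled policy.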
+ dict(type='Normalize', **img_norm_cfg), + dict(type='Collect', keys=['img', 'gt_labels']) +] +size = int((256 / 224) * 192) +test_pipeline = [ + dict(type='Resize', size=size, interpolation=3), + dict(type='CenterCrop', size=192), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Collect', keys=['img', 'gt_labels']) +] + +data = dict( + imgs_per_gpu=256, + workers_per_gpu=8, + use_repeated_augment_sampler=True, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, + root=data_train_root, + type='ClsSourceImageList'), + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_source=dict( + list_file=data_test_list, + root=data_test_root, + type='ClsSourceImageList'), + pipeline=test_pipeline)) + +eval_config = dict(initial=True, interval=1, gpu_collect=True) +eval_pipelines = [ + dict( + mode='test', + data=data['val'], + dist_eval=True, + evaluators=[dict(type='ClsEvaluator', topk=(1, 5))], + ) +] + +# additional hooks +custom_hooks = [] + +# optimizer +optimizer = dict( + type='Lamb', + lr=0.003, + weight_decay=0.05, + eps=1e-8, + paramwise_options={ + 'cls_token': dict(weight_decay=0.), + 'pos_embed': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'gamma_1': dict(weight_decay=0.), + 'gamma_2': dict(weight_decay=0.), + }) +optimizer_config = dict(grad_clip=None, update_interval=1) + +lr_config = dict( + policy='CosineAnnealingWarmupByEpoch', + by_epoch=True, + min_lr_ratio=0.00001 / 0.003, + warmup='linear', + warmup_by_epoch=True, + warmup_iters=5, + warmup_ratio=0.000001 / 0.003, +) +checkpoint_config = dict(interval=10) + +# runtime settings +total_epochs = 800 + +ema = dict(decay=0.99996) diff --git a/configs/classification/imagenet/vit/imagenet_deitiii_base_patch16_192_jpg.py b/configs/classification/imagenet/vit/imagenet_deitiii_base_patch16_192_jpg.py new file mode 100644 index 00000000..5a35f946 --- /dev/null +++ b/configs/classification/imagenet/vit/imagenet_deitiii_base_patch16_192_jpg.py @@ -0,0 +1,17 @@ +_base_ = './deitiii_base_patch16_192.py' +# model settings +model = dict( + type='Classification', + backbone=dict( + type='VisionTransformer', + img_size=[192], + num_classes=1000, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + drop_rate=0., + drop_path_rate=0.2, + use_layer_scale=True)) diff --git a/configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py b/configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py new file mode 100644 index 00000000..4c82cf9a --- /dev/null +++ b/configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py @@ -0,0 +1,17 @@ +_base_ = './deitiii_base_patch16_192.py' +# model settings +model = dict( + type='Classification', + backbone=dict( + type='VisionTransformer', + img_size=[192], + num_classes=1000, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + drop_rate=0., + drop_path_rate=0.45, + use_layer_scale=True)) diff --git a/configs/classification/imagenet/vit/imagenet_deitiii_small_patch16_224_jpg.py b/configs/classification/imagenet/vit/imagenet_deitiii_small_patch16_224_jpg.py new file mode 100644 index 00000000..9ba9cf77 --- /dev/null +++ b/configs/classification/imagenet/vit/imagenet_deitiii_small_patch16_224_jpg.py @@ -0,0 +1,86 @@ +_base_ = './deitiii_base_patch16_192.py' +# model settings +model = dict( + type='Classification', + backbone=dict( + 
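+ # Note: this config inherits the 3-Augment pipeline, Lamb optimizer, cosine schedule and
+ # EMA settings from deitiii_base_patch16_192.py via _base_; it overrides only the ViT-Small
+ # backbone here and the 224-resolution data and learning-rate settings further below.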
type='VisionTransformer', + img_size=[224], + num_classes=1000, + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + qkv_bias=True, + drop_rate=0., + drop_path_rate=0.05, + use_layer_scale=True)) + +data_train_list = 'data/imagenet1k/train.txt' +data_train_root = 'data/imagenet1k/train/' +data_test_list = 'data/imagenet1k/val.txt' +data_test_root = 'data/imagenet1k/val/' + +dataset_type = 'ClsDataset' +img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) +three_augment_policies = [[ + dict(type='PILGaussianBlur', prob=1.0, radius_min=0.1, radius_max=2.0), +], [ + dict(type='Solarization', threshold=128), +], [ + dict(type='Grayscale', num_output_channels=3), +]] +train_pipeline = [ + dict( + type='RandomResizedCrop', size=224, scale=(0.08, 1.0), + interpolation=3), # interpolation='bicubic' + dict(type='RandomHorizontalFlip'), + dict(type='MMAutoAugment', policies=three_augment_policies), + dict(type='ColorJitter', brightness=0.3, contrast=0.3, saturation=0.3), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Collect', keys=['img', 'gt_labels']) +] +test_pipeline = [ + dict(type='Resize', size=256, interpolation=3), + dict(type='CenterCrop', size=224), + dict(type='ToTensor'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Collect', keys=['img', 'gt_labels']) +] + +data = dict( + imgs_per_gpu=256, + workers_per_gpu=8, + use_repeated_augment_sampler=True, + train=dict( + type=dataset_type, + data_source=dict( + list_file=data_train_list, + root=data_train_root, + type='ClsSourceImageList'), + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_source=dict( + list_file=data_test_list, + root=data_test_root, + type='ClsSourceImageList'), + pipeline=test_pipeline)) + +eval_pipelines = [ + dict( + mode='test', + data=data['val'], + dist_eval=True, + evaluators=[dict(type='ClsEvaluator', topk=(1, 5))], + ) +] + +# optimizer +optimizer = dict(lr=0.004) + +lr_config = dict( + min_lr_ratio=0.00001 / 0.004, + warmup_ratio=0.000001 / 0.004, +) diff --git a/configs/detection/vitdet/lsj_coco_detection.py b/configs/detection/vitdet/lsj_coco_detection.py index f5da1064..fb243a23 100644 --- a/configs/detection/vitdet/lsj_coco_detection.py +++ b/configs/detection/vitdet/lsj_coco_detection.py @@ -101,13 +101,15 @@ val_dataset = dict( pipeline=test_pipeline) data = dict( - imgs_per_gpu=1, workers_per_gpu=2, train=train_dataset, val=val_dataset) + imgs_per_gpu=4, workers_per_gpu=2, train=train_dataset, val=val_dataset +) # 64(total batch size) = 4 (batch size/per gpu) x 8 (gpu num) x 2(node) # evaluation -eval_config = dict(interval=1, gpu_collect=False) +eval_config = dict(initial=False, interval=1, gpu_collect=False) eval_pipelines = [ dict( mode='test', + # dist_eval=True, evaluators=[ dict(type='CocoDetectionEvaluator', classes=CLASSES), ], diff --git a/configs/detection/vitdet/lsj_coco_instance.py b/configs/detection/vitdet/lsj_coco_instance.py index a42aa040..5271363f 100644 --- a/configs/detection/vitdet/lsj_coco_instance.py +++ b/configs/detection/vitdet/lsj_coco_instance.py @@ -101,13 +101,15 @@ val_dataset = dict( pipeline=test_pipeline) data = dict( - imgs_per_gpu=1, workers_per_gpu=2, train=train_dataset, val=val_dataset) + imgs_per_gpu=4, workers_per_gpu=2, train=train_dataset, val=val_dataset +) # 64(total batch size) = 4 (batch size/per gpu) x 8 (gpu num) x 2(node) # evaluation -eval_config = dict(interval=1, gpu_collect=False) +eval_config = dict(initial=False, interval=1, gpu_collect=False) 
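+ # Note: with imgs_per_gpu=4 the global batch size is 4 x 8 GPUs x 2 nodes = 64, matching the
+ # comment above; initial=False is assumed to skip the evaluation pass that would otherwise run
+ # before the first training epoch, while interval=1 keeps evaluation once per epoch.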
eval_pipelines = [ dict( mode='test', + # dist_eval=True, evaluators=[ dict(type='CocoDetectionEvaluator', classes=CLASSES), dict(type='CocoMaskEvaluator', classes=CLASSES) diff --git a/configs/detection/vitdet/vitdet_basicblock_100e.py b/configs/detection/vitdet/vitdet_basicblock_100e.py deleted file mode 100644 index a3ea54e7..00000000 --- a/configs/detection/vitdet/vitdet_basicblock_100e.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = './vitdet_100e.py' - -model = dict(backbone=dict(aggregation='basicblock')) diff --git a/configs/detection/vitdet/vitdet_bottleneck_100e.py b/configs/detection/vitdet/vitdet_bottleneck_100e.py deleted file mode 100644 index a6031797..00000000 --- a/configs/detection/vitdet/vitdet_bottleneck_100e.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = './vitdet_100e.py' - -model = dict(backbone=dict(aggregation='bottleneck')) diff --git a/configs/detection/vitdet/vitdet_cascade_mask_rcnn.py b/configs/detection/vitdet/vitdet_cascade_mask_rcnn.py new file mode 100644 index 00000000..dfe0d68d --- /dev/null +++ b/configs/detection/vitdet/vitdet_cascade_mask_rcnn.py @@ -0,0 +1,231 @@ +# model settings + +norm_cfg = dict(type='GN', num_groups=1, eps=1e-6, requires_grad=True) + +pretrained = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/warpper_mae_vit-base-p16-1600e.pth' +model = dict( + type='CascadeRCNN', + pretrained=pretrained, + backbone=dict( + type='ViTDet', + img_size=1024, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + drop_path_rate=0.1, + window_size=14, + mlp_ratio=4, + qkv_bias=True, + window_block_indexes=[ + # 2, 5, 8 11 for global attention + 0, + 1, + 3, + 4, + 6, + 7, + 9, + 10, + ], + residual_block_indexes=[], + use_rel_pos=True), + neck=dict( + type='SFP', + in_channels=768, + out_channels=256, + scale_factors=(4.0, 2.0, 1.0, 0.5), + norm_cfg=norm_cfg, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + num_convs=2, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + 
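+ # Note: the three cascade stages use progressively tighter box-delta stds
+ # (0.1 -> 0.05 -> 0.033) and, in train_cfg further below, progressively higher assigner IoU
+ # thresholds (0.5 -> 0.6 -> 0.7) -- the standard Cascade R-CNN refinement schedule.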
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + norm_cfg=norm_cfg, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) + +mmlab_modules = [ + dict(type='mmdet', name='CascadeRCNN', module='model'), + dict(type='mmdet', name='RPNHead', module='head'), + dict(type='mmdet', name='CascadeRoIHead', module='head'), +] diff --git a/configs/detection/vitdet/vitdet_cascade_mask_rcnn_100e.py b/configs/detection/vitdet/vitdet_cascade_mask_rcnn_100e.py new file mode 100644 index 00000000..bbbc339f --- /dev/null +++ b/configs/detection/vitdet/vitdet_cascade_mask_rcnn_100e.py @@ -0,0 +1,4 @@ +_base_ = [ + './vitdet_cascade_mask_rcnn.py', './lsj_coco_instance.py', + './vitdet_schedule_100e.py' +] diff --git a/configs/detection/vitdet/vitdet_faster_rcnn.py b/configs/detection/vitdet/vitdet_faster_rcnn.py index 48604d8b..0a00b397 100644 --- a/configs/detection/vitdet/vitdet_faster_rcnn.py +++ b/configs/detection/vitdet/vitdet_faster_rcnn.py @@ -1,6 +1,6 @@ # model settings -norm_cfg = 
dict(type='GN', num_groups=1, requires_grad=True) +norm_cfg = dict(type='GN', num_groups=1, eps=1e-6, requires_grad=True) pretrained = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/warpper_mae_vit-base-p16-1600e.pth' model = dict( @@ -9,22 +9,32 @@ model = dict( backbone=dict( type='ViTDet', img_size=1024, + patch_size=16, embed_dim=768, depth=12, num_heads=12, + drop_path_rate=0.1, + window_size=14, mlp_ratio=4, qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.1, - use_abs_pos_emb=True, - aggregation='attn', - ), + window_block_indexes=[ + # 2, 5, 8 11 for global attention + 0, + 1, + 3, + 4, + 6, + 7, + 9, + 10, + ], + residual_block_indexes=[], + use_rel_pos=True), neck=dict( type='SFP', - in_channels=[768, 768, 768, 768], + in_channels=768, out_channels=256, + scale_factors=(4.0, 2.0, 1.0, 0.5), norm_cfg=norm_cfg, num_outs=5), rpn_head=dict( @@ -32,7 +42,6 @@ model = dict( in_channels=256, feat_channels=256, num_convs=2, - norm_cfg=norm_cfg, anchor_generator=dict( type='AnchorGenerator', scales=[8], @@ -98,7 +107,7 @@ model = dict( pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, - match_low_quality=True, + match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', diff --git a/configs/detection/vitdet/vitdet_faster_rcnn_100e.py b/configs/detection/vitdet/vitdet_faster_rcnn_100e.py index 5a43b575..bfeab9d1 100644 --- a/configs/detection/vitdet/vitdet_faster_rcnn_100e.py +++ b/configs/detection/vitdet/vitdet_faster_rcnn_100e.py @@ -1,4 +1,4 @@ _base_ = [ - './vitdet_faster_rcnn.py', './lsj_coco_detection.py', + './vitdet_faster_rcnn.py', './lsj_coco_instance.py', './vitdet_schedule_100e.py' ] diff --git a/configs/detection/vitdet/vitdet_mask_rcnn.py b/configs/detection/vitdet/vitdet_mask_rcnn.py index 890f6e8f..6b1ed1ce 100644 --- a/configs/detection/vitdet/vitdet_mask_rcnn.py +++ b/configs/detection/vitdet/vitdet_mask_rcnn.py @@ -1,6 +1,6 @@ # model settings -norm_cfg = dict(type='GN', num_groups=1, requires_grad=True) +norm_cfg = dict(type='GN', num_groups=1, eps=1e-6, requires_grad=True) pretrained = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/warpper_mae_vit-base-p16-1600e.pth' model = dict( @@ -9,22 +9,32 @@ model = dict( backbone=dict( type='ViTDet', img_size=1024, + patch_size=16, embed_dim=768, depth=12, num_heads=12, + drop_path_rate=0.1, + window_size=14, mlp_ratio=4, qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.1, - use_abs_pos_emb=True, - aggregation='attn', - ), + window_block_indexes=[ + # 2, 5, 8 11 for global attention + 0, + 1, + 3, + 4, + 6, + 7, + 9, + 10, + ], + residual_block_indexes=[], + use_rel_pos=True), neck=dict( type='SFP', - in_channels=[768, 768, 768, 768], + in_channels=768, out_channels=256, + scale_factors=(4.0, 2.0, 1.0, 0.5), norm_cfg=norm_cfg, num_outs=5), rpn_head=dict( @@ -32,7 +42,6 @@ model = dict( in_channels=256, feat_channels=256, num_convs=2, - norm_cfg=norm_cfg, anchor_generator=dict( type='AnchorGenerator', scales=[8], @@ -112,7 +121,7 @@ model = dict( pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, - match_low_quality=True, + match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', diff --git a/configs/detection/vitdet/vitdet_100e.py b/configs/detection/vitdet/vitdet_mask_rcnn_100e.py similarity index 100% rename from configs/detection/vitdet/vitdet_100e.py rename to 
configs/detection/vitdet/vitdet_mask_rcnn_100e.py diff --git a/configs/detection/vitdet/vitdet_schedule_100e.py b/configs/detection/vitdet/vitdet_schedule_100e.py index e659b1f6..a9160eba 100644 --- a/configs/detection/vitdet/vitdet_schedule_100e.py +++ b/configs/detection/vitdet/vitdet_schedule_100e.py @@ -1,26 +1,29 @@ _base_ = 'configs/base.py' +log_config = dict( + interval=200, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + checkpoint_config = dict(interval=10) + # optimizer -paramwise_options = { - 'norm': dict(weight_decay=0.), - 'bias': dict(weight_decay=0.), - 'pos_embed': dict(weight_decay=0.), - 'cls_token': dict(weight_decay=0.) -} optimizer = dict( type='AdamW', lr=1e-4, betas=(0.9, 0.999), weight_decay=0.1, - paramwise_options=paramwise_options) -optimizer_config = dict(grad_clip=None, loss_scale=512.) + constructor='LayerDecayOptimizerConstructor', + paramwise_options=dict(num_layers=12, layer_decay_rate=0.7)) +optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=250, - warmup_ratio=0.067, + warmup_ratio=0.001, step=[88, 96]) total_epochs = 100 diff --git a/docs/source/_static/result.jpg b/docs/source/_static/result.jpg index 5bb73d81..d63bad1d 100644 --- a/docs/source/_static/result.jpg +++ b/docs/source/_static/result.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee64c0caef841c61c7e6344b7fe2c07a38fba07a8de81ff38c0686c641e0a283 -size 190356 +oid sha256:c696a58a2963b5ac47317751f04ff45bfed4723f2f70bacf91eac711f9710e54 +size 189432 diff --git a/docs/source/api/easycv.models.backbones.rst b/docs/source/api/easycv.models.backbones.rst index 3f1ecfd0..4c742c37 100644 --- a/docs/source/api/easycv.models.backbones.rst +++ b/docs/source/api/easycv.models.backbones.rst @@ -156,7 +156,7 @@ easycv.models.backbones.swin\_transformer\_dynamic module easycv.models.backbones.vit\_transfomer\_dynamic module ------------------------------------------------------- -.. automodule:: easycv.models.backbones.vit_transfomer_dynamic +.. 
automodule:: easycv.models.backbones.vit_transformer_dynamic :members: :undoc-members: :show-inheritance: diff --git a/docs/source/model_zoo_cls.md b/docs/source/model_zoo_cls.md index 3d91275e..a2254ddf 100644 --- a/docs/source/model_zoo_cls.md +++ b/docs/source/model_zoo_cls.md @@ -21,6 +21,9 @@ | hrnetw64 | [hrnetw64](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/hrnet/imagenet_hrnetw64_jpg.py) | 79.884 | 95.04 | 5120 | 54.74 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/resnet/hrnetw64/epoch_100.pth) | | vit-base-patch16 | [vit-base-patch16](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/vit/imagenet_vit_base_patch16_224_jpg.py) | 76.082 | 92.026 | 346 | 8.03 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/vit/vit-base-patch16/epoch_300.pth) | | swin-tiny-patch4-window7 | [swin-tiny-patch4-window7](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/swint/imagenet_swin_tiny_patch4_window7_224_jpg.py) | 80.528 | 94.822 | 132 | 12.94 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/swint/swin-tiny-patch4-window7/epoch_300.pth) | +| deitiii-small-patch16-224 | [deitiii-small-patch16-224](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/vit/imagenet_deitiii_small_patch16_224_jpg.py) | 81.408 | 95.388 | 89 | 4.53 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/deitiii/imagenet_deitiii_small_patch16_224/deitiii_small.pth) | +| deitiii-base-patch16-192 | [deitiii-base-patch16-192](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/vit/imagenet_deitiii_base_patch16_192_jpg.py) | 82.982 | 95.95 | 337 | 4.63 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/deitiii/imagenet_deitiii_base_patch16_192/deitiii_base.pth) | +| deitiii-large-patch16-192 | [deitiii-large-patch16-192](https://github.com/alibaba/EasyCV/tree/master/configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py) | 83.902 | 96.296 | 1170 | 10.17 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/deitiii/imagenet_deitiii_large_patch16_192/deitiii_large.pth) | (ps: 通过EasyCV训练得到模型结果,推理的输入尺寸默认为224,机器默认为V100 16G,其中gpu memory记录的是gpu peak memory) diff --git a/docs/source/model_zoo_det.md b/docs/source/model_zoo_det.md index 03eb3588..ccabed31 100644 --- a/docs/source/model_zoo_det.md +++ b/docs/source/model_zoo_det.md @@ -6,38 +6,37 @@ Inference default use V100 16G. Pretrained on COCO2017 dataset. (The result has been optimized with PAI-Blade, and only computes the model inference time. To learn about end2end inference time, you can refer to [export.md](./tutorials/export.md).) -| Algorithm | Config | Params | SpeedV100
fp16 b32 | mAPval
0.5:0.95 | APval
50 | Download | -|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|--------|-----------------------------------------|-------------------------------------|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| YOLOX-s | [yolox_s_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_s_8xb16_300e_coco.py) | 9M | 0.68ms | 40.0 | 58.9 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_s_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_s_bs16_lr002/log.txt) | -| PAI-YOLOXs | [yoloxs_pai_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/pai_yoloxs_8xb16_300e_coco.py) | 16M | 0.71ms | 41.4 | 60.0 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/model/pai_yoloxs.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/log/pai_yoloxs.json) | -| PAI-YOLOXs-ASFF | [yoloxs_pai_asff_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/pai_yoloxs_asff_8xb16_300e_coco.py) | 21M | 0.87ms | 42.8 | 61.8 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/model/pai_yoloxs_asff.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/log/pai_yoloxs_asff.json) | +| Algorithm | Config | Params | SpeedV100
fp16 b32 | mAPval
0.5:0.95 | APval
50 | Download | +| --------------------- | ------------------------------------------------------------ | ------ | --------------------------------------- | ----------------------------------- | ---------------------------- | ------------------------------------------------------------ | +| YOLOX-s | [yolox_s_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_s_8xb16_300e_coco.py) | 9M | 0.68ms | 40.0 | 58.9 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_s_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_s_bs16_lr002/log.txt) | +| PAI-YOLOXs | [yoloxs_pai_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/pai_yoloxs_8xb16_300e_coco.py) | 16M | 0.71ms | 41.4 | 60.0 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/model/pai_yoloxs.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/log/pai_yoloxs.json) | +| PAI-YOLOXs-ASFF | [yoloxs_pai_asff_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/pai_yoloxs_asff_8xb16_300e_coco.py) | 21M | 0.87ms | 42.8 | 61.8 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/model/pai_yoloxs_asff.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/log/pai_yoloxs_asff.json) | | PAI-YOLOXs-ASFF-TOOD3 | [yoloxs_pai_asff_tood3_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/pai_yoloxs_asff_tood3_8xb16_300e_coco.py) | 24M | 1.15ms | 43.9 | 62.1 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/model/pai_yoloxs_asff_tood3.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox-pai/log/pai_yoloxs_asff_tood3.json) | -| YOLOX-m | [yolox_m_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_m_8xb16_300e_coco.py) | 25M | 1.52ms | 46.3 | 64.9 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_m_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_m_bs16_lr002/log.txt) | -| YOLOX-l | [yolox_l_8xb8_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_m_8xb8_300e_coco.py) | 54M | 2.47ms | 48.9 | 67.5 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_l_bs8_lr001/epoch_290.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_l_bs8_lr001/log.txt) | -| YOLOX-x | [yolox_x_8xb8_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_x_8xb8_300e_coco.py) | 99M | 4.74ms | 50.9 | 69.2 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_x_bs8_lr001/epoch_290.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_x_bs8_lr001/log.txt) | -| YOLOX-tiny | [yolox_tiny_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_tiny_8xb16_300e_coco.py) | 5M | 0.28ms | 31.5 | 
49.2 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_tiny_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_tiny_bs16_lr002/log.txt) | -| YOLOX-nano | [yolox_nano_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_tiny_8xb16_300e_coco.py) | 2.2M | 0.19ms | 26.5 | 42.6 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_nano_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_nano_bs16_lr002/log.txt) | +| YOLOX-m | [yolox_m_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_m_8xb16_300e_coco.py) | 25M | 1.52ms | 46.3 | 64.9 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_m_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_m_bs16_lr002/log.txt) | +| YOLOX-l | [yolox_l_8xb8_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_m_8xb8_300e_coco.py) | 54M | 2.47ms | 48.9 | 67.5 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_l_bs8_lr001/epoch_290.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_l_bs8_lr001/log.txt) | +| YOLOX-x | [yolox_x_8xb8_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_x_8xb8_300e_coco.py) | 99M | 4.74ms | 50.9 | 69.2 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_x_bs8_lr001/epoch_290.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_x_bs8_lr001/log.txt) | +| YOLOX-tiny | [yolox_tiny_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_tiny_8xb16_300e_coco.py) | 5M | 0.28ms | 31.5 | 49.2 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_tiny_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_tiny_bs16_lr002/log.txt) | +| YOLOX-nano | [yolox_nano_8xb16_300e_coco](https://github.com/alibaba/EasyCV/tree/master/configs/detection/yolox/yolox_tiny_8xb16_300e_coco.py) | 2.2M | 0.19ms | 26.5 | 42.6 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_nano_bs16_lr002/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/yolox/yolox_nano_bs16_lr002/log.txt) | ## ViTDet - -| Algorithm | Config | Params
(backbone/total) | inference time(V100)
(ms/img) | bbox_mAPval
0.5:0.95 | mask_mAPval
0.5:0.95 | Download | -| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| ViTDet_MaskRCNN | [vitdet_maskrcnn](https://github.com/alibaba/EasyCV/tree/master/configs/detection/vitdet/vitdet_100e.py) | 88M/118M | 163ms | 50.57 | 44.96 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn.log.json) | +| Algorithm | Config | Params
(backbone/total) | Train memory
(GB) | inference time(V100)
(ms/img) | bbox_mAPval
0.5:0.95 | mask_mAPval
0.5:0.95 | Download | +| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| ViTDet_MaskRCNN | [vitdet_maskrcnn](https://github.com/alibaba/EasyCV/tree/master/configs/detection/vitdet/vitdet_mask_rcnn_100e.py) | 86M/111M | 13.3 (fp16) | 138ms | 50.65 | 45.41 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/20220901_135827.log.json) | ## FCOS -| Algorithm | Config | Params
(backbone/total) | inference time(V100)
(ms/img) | mAPval
0.5:0.95 | APval
50 | Download | -| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| FCOS-r50(caffe) | [fcos-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/fcos/fcos_r50_caffe_1x_coco.py) | 23M/32M | 85.8ms | 38.58 | 57.18 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/20220621_121315.log.json) | -| FCOS-r50(torch) | [fcos-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/fcos/fcos_r50_torch_1x_coco.py) | 23M/32M | 105.3ms | 38.88 | 58.01 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/fcos_epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/20220826_182628.log.json) | +| Algorithm | Config | Params
(backbone/total) | Train memory
(GB) | inference time(V100)
(ms/img) | mAPval
0.5:0.95 | APval
50 | Download | +| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| FCOS-r50(caffe) | [fcos-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/fcos/fcos_r50_caffe_1x_coco.py) | 23M/32M | 5.0 | 85.8ms | 38.58 | 57.18 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/20220621_121315.log.json) | +| FCOS-r50(torch) | [fcos-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/fcos/fcos_r50_torch_1x_coco.py) | 23M/32M | 4.0 (fp16) | 105.3ms | 38.88 | 58.01 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/fcos_epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/20220826_182628.log.json) | ## DETR -| Algorithm | Config | Params
(backbone/total) | inference time(V100)
(ms/img) | bbox_mAPval
0.5:0.95 | APval
50 | Download | -| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| DETR-r50 | [detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/detr/detr_r50_8x2_150e_coco.py) | 23M/41M | 48.5ms | 39.92 | 60.52 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/detr/epoch_150.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/detr/20220609_101243.log.json) | -| DAB-DETR-r50 | [dab-detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dab_detr/dab_detr_r50_8x2_50e_coco.py) | 23M/43M | 58.5ms | 42.52 | 63.03 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dab_detr/dab_detr_epoch_50.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dab_detr/20220610_122811.log.json) | -| DN-DETR-r50 | [dab-detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dab_detr/dn_detr_r50_8x2_50e_coco.py) | 23M/43M | 58.5ms | 44.39 | 64.66 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dn_detr/dn_detr_epoch_50.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dn_detr/20220713_105127.log.json) | +| Algorithm | Config | Params
(backbone/total) | Train memory
(GB) | inference time(V100)
(ms/img) | bbox_mAPval
0.5:0.95 | APval
50 | Download | +| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| DETR-r50 | [detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/detr/detr_r50_8x2_150e_coco.py) | 23M/41M | 8.5 | 48.5ms | 39.92 | 60.52 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/detr/epoch_150.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/detr/20220609_101243.log.json) | +| DAB-DETR-r50 | [dab-detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dab_detr/dab_detr_r50_8x2_50e_coco.py) | 23M/43M | 2.6 | 58.5ms | 42.52 | 63.03 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dab_detr/dab_detr_epoch_50.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dab_detr/20220610_122811.log.json) | +| DN-DETR-r50 | [dab-detr-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dab_detr/dn_detr_r50_8x2_50e_coco.py) | 23M/43M | 7.8 | 58.5ms | 44.39 | 64.66 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dn_detr/dn_detr_epoch_50.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dn_detr/20220713_105127.log.json) | ## DINO diff --git a/docs/source/model_zoo_seg.md b/docs/source/model_zoo_seg.md index 97820feb..a1cdbb82 100644 --- a/docs/source/model_zoo_seg.md +++ b/docs/source/model_zoo_seg.md @@ -4,29 +4,29 @@ Pretrained on **Pascal VOC 2012 + Aug**. -| Algorithm | Config | Params
(backbone/total) | inference time(V100)
(ms/img) | mIoU | Download | -| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| fcn_r50_d8 | [fcn_r50-d8_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/fcn/fcn_r50-d8_512x512_8xb4_60e_voc12aug.py) | 23M/49M | 166ms | 69.01 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/fcn_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/fcn_r50/20220525_203606.log.json) | +| Algorithm | Config | Params
(backbone/total) | Train memory
(GB) | inference time(V100)
(ms/img) | mIoU | Download | +| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| fcn_r50_d8 | [fcn_r50-d8_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/fcn/fcn_r50-d8_512x512_8xb4_60e_voc12aug.py) | 23M/49M | 19.8 | 166ms | 69.01 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/fcn_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/fcn_r50/20220525_203606.log.json) | ## UperNet Pretrained on **Pascal VOC 2012 + Aug**. - -| Algorithm | Config | Params
(backbone/total) | inference time(V100)
(ms/img) | mIoU | Download | -| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| upernet_r50 | [upernet_r50_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/upernet/upernet_r50_512x512_8xb4_60e_voc12aug.py) | 23M/66M | 282.9ms | 76.59 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/20220706_114712.log.json) | +| Algorithm | Config | Params
(backbone/total) | Train memory
(GB) | inference time(V100)
(ms/img) | mIoU | Download | +| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| upernet_r50 | [upernet_r50_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/upernet/upernet_r50_512x512_8xb4_60e_voc12aug.py) | 23M/66M | 5.5 | 282.9ms | 76.59 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/20220706_114712.log.json) | ## Mask2former ### Instance Segmentation on COCO -| Algorithm | Config | box MAP | Mask mAP | Download | -| ---------- | ------------------------------------------------------------ | ------------------------ |----------|---------------------------------------------------------------------------- | -| mask2former_r50 | [mask2former_r50_8xb2_e50_instance](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_instance.py) | 46.09 | 43.26 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_instance/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_instance/20220620_113639.log.json) | +| Algorithm | Config | Train memory
(GB) | box MAP | Mask mAP | Download | +| ---------- | ------------------------------------------------------------ |----------|----------|----------|----------| +| mask2former_r50 | [mask2former_r50_8xb2_e50_instance](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_instance.py) | 18.8 | 46.09 | 43.26 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_instance/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_instance/20220620_113639.log.json) | ### Panoptic Segmentation on COCO -| Algorithm | Config | PQ | box MAP | Mask mAP | Download | -| ---------- | ---------- | ------------------------------------------------------------ | ------------------------ |----------|---------------------------------------------------------------------------- | -| mask2former_r50 | [mask2former_r50_8xb2_e50_panopatic](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_panopatic.py) | 51.64 | 44.81 | 41.88 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/20220629_170721.log.json) | + +| Algorithm | Config | Train memory
(GB) | PQ | box MAP | Mask mAP | Download | +| ---------- | ---------- | ------------------------------------------------------------ | ------------------------ |----------|---------------------------------------------------------------------------- |---------------------------------------------------------------------------- | +| mask2former_r50 | [mask2former_r50_8xb2_e50_panopatic](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_panopatic.py) | 18.8 | 51.64 | 44.81 | 41.88 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/20220629_170721.log.json) | ## SegFormer diff --git a/docs/source/model_zoo_ssl.md b/docs/source/model_zoo_ssl.md index 81650e43..c6a1ef30 100644 --- a/docs/source/model_zoo_ssl.md +++ b/docs/source/model_zoo_ssl.md @@ -5,19 +5,19 @@ Pretrained on **ImageNet** dataset. -| Config | Backbone | Params
(backbone/total) | Flops | inference time(V100)
(ms/img) | Epochs | Download | -| ------------------------------------------------------------ | -------- | -------------------------- | ----- | -------------------------------- | ------ | ------------------------------------------------------------ | -| [mae_vit_base_patch16_8xb64_400e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_base_patch16_8xb64_400e.py) | ViT-B/16 | 85M/111M | 9.8G | 8.03 | 400 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-400/pretrain_400.pth) | -| [mae_vit_base_patch16_8xb64_1600e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_base_patch16_8xb64_1600e.py) | ViT-B/16 | 85M/111M | 9.8G | 8.03 | 1600 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/pretrain_1600.pth) | -| [mae_vit_large_patch16_8xb32_1600e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_large_patch16_8xb32_1600e.py) | ViT-L/16 | 303M/329M | 20.8G | 16.30 | 1600 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-l-1600/pretrain_1600.pth) | +| Config | Backbone | Params
(backbone/total) | Train memory
(GB) | Flops | inference time(V100)
(ms/img) | Epochs | Download | +| ------------------------------------------------------------ | -------- | -------------------------- | ------------------ | ----- | -------------------------------- | ------ | ------------------------------------------------------------ | +| [mae_vit_base_patch16_8xb64_400e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_base_patch16_8xb64_400e.py) | ViT-B/16 | 85M/111M | 9.5 | 9.8G | 8.03 | 400 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-400/pretrain_400.pth) | +| [mae_vit_base_patch16_8xb64_1600e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_base_patch16_8xb64_1600e.py) | ViT-B/16 | 85M/111M | 9.5 | 9.8G | 8.03 | 1600 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/pretrain_1600.pth) | +| [mae_vit_large_patch16_8xb32_1600e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mae/mae_vit_large_patch16_8xb32_1600e.py) | ViT-L/16 | 303M/329M | 11.3 | 20.8G | 16.30 | 1600 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-l-1600/pretrain_1600.pth) | ### Fast ConvMAE Pretrained on **ImageNet** dataset. -| Config | Backbone | Params
(backbone/total) | Flops | inference time(V100)
(ms/img) | Epochs | Download | -| ------------------------------------------------------------ | --------------- | --------------------------- | ----- | --------------------------------- | ------ | ------------------------------------------------------------ | -| [fast_convmae_vit_base_patch16_8xb64_50e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/fast_convmae/fast_convmae_vit_base_patch16_8xb64_50e.py) | ConvViT-B/16 | 88M/115M | 45.1G | 6.88 | 50 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/FastConvMAE/pretrained/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/FastConvMAE/pretrained/20220617_110501.log.json) | +| Config | Backbone | Params
(backbone/total) | Train memory
(GB) | Flops | inference time(V100)
(ms/img) | Total train time | Epochs | Download | +| ------------------------------------------------------------ | --------------- | --------------------------- | ----- | --------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| [fast_convmae_vit_base_patch16_8xb64_50e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/fast_convmae/fast_convmae_vit_base_patch16_8xb64_50e.py) | ConvViT-B/16 | 88M/115M | 30.3 | 45.1G | 6.88 | 20h
(8*A100) | 50 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/FastConvMAE/pretrained/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/FastConvMAE/pretrained/20220617_110501.log.json) | > The FLOPs of Fast ConvMAE are about four times those of MAE, because MAE's mask keeps only 25% of the tokens in each forward pass, while Fast ConvMAE uses a complementary masking strategy that splits the mask into four complementary parts, each keeping 25% of the tokens. This is equivalent to learning from four samples in each forward pass, achieving roughly 4x the learning effect. @@ -25,34 +25,34 @@ Pretrained on **ImageNet** dataset. Pretrained on **ImageNet** dataset.
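For intuition about the complementary masking described above, here is a minimal, self-contained sketch (illustrative only, not the FastConvMAE implementation in this repo): four disjoint 25% masks are carved out of one random permutation of the patch indices, so the four views processed in a single step together cover every token exactly once.

```python
import torch

def complementary_masks(num_patches: int, num_splits: int = 4, generator=None):
    """Split one random permutation of patch indices into `num_splits`
    disjoint masks, each keeping 1/num_splits of the tokens."""
    perm = torch.randperm(num_patches, generator=generator)
    keep = num_patches // num_splits
    masks = torch.zeros(num_splits, num_patches, dtype=torch.bool)
    for i in range(num_splits):
        masks[i, perm[i * keep:(i + 1) * keep]] = True  # True = token visible in this view
    return masks

masks = complementary_masks(num_patches=196)   # 14x14 patches for a 224x224 image
assert masks.sum(dim=0).eq(1).all()            # every token is kept by exactly one mask
print(masks.sum(dim=1))                        # tensor([49, 49, 49, 49]) -> 25% each
```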
-| Config | Backbone | Params
(backbone/total) | inference time(V100)
(ms/img) | Epochs | Download | -| ------------------------------------------------------------ | --------- | --------------------------- | --------------------------------- | ------ | ------------------------------------------------------------ | -| [dino_deit_small_p16_8xb32_100e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/dino/dino_deit_small_p16_8xb32_100e_tfrecord.py) | DeiT-S/16 | 21M/88M | 6.17 | 100 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/dino_deit_small/epoch_100.pth) | +| Config | Backbone | Params
(backbone/total) | Train memory
(GB) | inference time(V100)
(ms/img) | Epochs | Download | +| ------------------------------------------------------------ | --------- | --------------------------- | ------------------ | --------------------------------- | ------ | ------------------------------------------------------------ | +| [dino_deit_small_p16_8xb32_100e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/dino/dino_deit_small_p16_8xb32_100e_tfrecord.py) | DeiT-S/16 | 21M/88M | 10.5 | 6.17 | 100 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/dino_deit_small/epoch_100.pth) | ### MoBY Pretrained on **ImageNet** dataset. -| Config | Backbone | Params
(backbone/total) | Flops | inference time(V100)
(ms/img) | Epochs | Download | -| ------------------------------------------------------------ | --------- | --------------------------- | ----- | --------------------------------- | ------ | ------------------------------------------------------------ | -| [moby_deit_small_p16_4xb128_300e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/moby/moby_deit_small_p16_4xb128_300e_tfrecord.py) | DeiT-S/16 | 21M/26M | 18.6G | 6.17 | 300 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_deit_small_p16/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_deit_small_p16/log.txt) | -| [moby_swin_tiny_8xb64_300e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/moby/moby_dynamic_swin_tiny_8xb64_300e_tfrecord.py) | Swin-T | 27M/33M | 18.1G | 9.74 | 300 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_dynamic_swin_tiny/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_dynamic_swin_tiny/log.txt) | +| Config | Backbone | Params
(backbone/total) | Flops | Train memory
(GB) | inference time(V100)
(ms/img) | Epochs | Download | +| ------------------------------------------------------------ | --------- | --------------------------- | ----- | ------------------ | --------------------------------- | ------ | ------------------------------------------------------------ | +| [moby_deit_small_p16_4xb128_300e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/moby/moby_deit_small_p16_4xb128_300e_tfrecord.py) | DeiT-S/16 | 21M/26M | 18.6G | 21.4 | 6.17 | 300 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_deit_small_p16/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_deit_small_p16/log.txt) | +| [moby_swin_tiny_8xb64_300e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/moby/moby_dynamic_swin_tiny_8xb64_300e_tfrecord.py) | Swin-T | 27M/33M | 18.1G | 16.1 | 9.74 | 300 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_dynamic_swin_tiny/epoch_300.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/moby_dynamic_swin_tiny/log.txt) | ### MoCo V2 Pretrained on **ImageNet** dataset. -| Config | Backbone | Params
(backbone/total) | Flops | inference time(V100)
(ms/img) | Epochs | Download | -| ------------------------------------------------------------ | -------- | --------------------------- | ----- | --------------------------------- | ------ | ------------------------------------------------------------ | -| [mocov2_resnet50_8xb32_200e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mocov2/mocov2_rn50_8xb32_200e_tfrecord.py) | ResNet50 | 23M/28M | 8.2G | 8.59 | 200 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mocov2_r50/epoch_200.pth) | +| Config | Backbone | Params
(backbone/total) | Flops | Train memory
(GB) | inference time(V100)
(ms/img) | Epochs | Download | +| ------------------------------------------------------------ | -------- | --------------------------- | ----- | ------------------ | --------------------------------- | ------ | ------------------------------------------------------------ | +| [mocov2_resnet50_8xb32_200e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/mocov2/mocov2_rn50_8xb32_200e_tfrecord.py) | ResNet50 | 23M/28M | 8.2G | 5.4 | 8.59 | 200 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mocov2_r50/epoch_200.pth) | ### SwAV Pretrained on **ImageNet** dataset. -| Config | Backbone | Params
(backbone/total) | Flops | inference time(V100)
(ms/img) | Epochs | Download | -| ------------------------------------------------------------ | -------- | --------------------------- | ----- | --------------------------------- | ------ | ------------------------------------------------------------ | -| [swav_resnet50_8xb32_200e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/swav/swav_rn50_8xb32_200e_tfrecord.py) | ResNet50 | 23M/28M | 12.9G | 8.59 | 200 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/swav_r50/epoch_200.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/swav_r50/log.txt) | +| Config | Backbone | Params
(backbone/total) | Flops | Train memory
(GB) | inference time(V100)
(ms/img) | Epochs | Download | +| ------------------------------------------------------------ | -------- | --------------------------- | ----- | ------------------ | --------------------------------- | ------ | ------------------------------------------------------------ | +| [swav_resnet50_8xb32_200e](https://github.com/alibaba/EasyCV/tree/master/configs/selfsup/swav/swav_rn50_8xb32_200e_tfrecord.py) | ResNet50 | 23M/28M | 12.9G | 11.3 | 8.59 | 200 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/swav_r50/epoch_200.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/swav_r50/log.txt) | ## Benchmarks diff --git a/docs/source/tutorials/EasyCV图像自监督训练-MAE.ipynb b/docs/source/tutorials/EasyCV图像自监督训练-MAE.ipynb index 71024860..f96f0b52 100644 --- a/docs/source/tutorials/EasyCV图像自监督训练-MAE.ipynb +++ b/docs/source/tutorials/EasyCV图像自监督训练-MAE.ipynb @@ -262,7 +262,7 @@ "state_dict = torch.load(weight_path)['state_dict']\n", "state_dict_out = {}\n", "for key in state_dict:\n", - " state_dict_out[key.replace('encoder.','')] = state_dict[key]\n", + " state_dict_out['model.' + key.replace('encoder.','')] = state_dict[key]\n", "torch.save(state_dict_out,weight_path)" ] }, @@ -324,7 +324,7 @@ "outputs": [], "source": [ "!python -m torch.distributed.launch --nproc_per_node=1 --master_port=29930 \\\n", - "/home/pai/lib/python3.6/site-packages/easycv/tools/train.py mae_vit_base_patch16_8xb64_100e_lrdecay065_fintune.py --work_dir work_dir/selfsup/jpg/mae --launcher pytorch" + "/home/pai/lib/python3.6/site-packages/easycv/tools/train.py mae_vit_base_patch16_8xb64_100e_lrdecay065_fintune.py --work_dir work_dir/selfsup/jpg/mae_fintune --launcher pytorch" ] }, { @@ -333,7 +333,56 @@ "metadata": {}, "source": [ "### 预测\n", - "参考EasyCV图像分类的demo,对训练好的模型导出并预测" + "对训练好的模型导出并预测" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4271c852", + "metadata": {}, + "outputs": [], + "source": [ + "! python -m easycv.tools.export mae_vit_base_patch16_8xb64_100e_lrdecay065_fintune.py work_dir/selfsup/jpg/mae_fintune/ClsEvaluator_neck_top1_best.pth work_dir/selfsup/jpg/mae_fintune/best_export.pth" + ] + }, + { + "cell_type": "markdown", + "id": "2cc9e6fc", + "metadata": {}, + "source": [ + "下载测试图片和标签文件" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "973d5bd4", + "metadata": {}, + "outputs": [], + "source": [ + "! wget http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/cifar10/qince_data/predict/aeroplane_s_000004.png\n", + "! 
wget http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/doc/easycv/configs/selfsup/mae/label_map.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a5a3632", + "metadata": {}, + "outputs": [], + "source": [ + "import cv2\n", + "from easycv.predictors.classifier import TorchClassifier\n", + "\n", + "output_ckpt = 'work_dir/selfsup/jpg/mae_fintune/best_export.pth'\n", + "tcls = TorchClassifier(output_ckpt, topk=1, label_map_path='label_map.txt')\n", + "\n", + "img = cv2.imread('aeroplane_s_000004.png')\n", + "# input image should be RGB order\n", + "img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n", + "output = tcls.predict([img])\n", + "print(output)" ] } ], diff --git a/easycv/apis/export.py b/easycv/apis/export.py index fe8a1850..9a0b9165 100644 --- a/easycv/apis/export.py +++ b/easycv/apis/export.py @@ -13,6 +13,7 @@ import torchvision.transforms.functional as t_f from mmcv.utils import Config from easycv.file import io +from easycv.framework.errors import ValueError from easycv.models import (DINO, MOCO, SWAV, YOLOX, Classification, MoBY, build_model) from easycv.utils.checkpoint import load_checkpoint @@ -106,6 +107,9 @@ def _export_cls(model, cfg, filename): backbone=replace_syncbn(cfg.model.backbone), ) + # avoid load pretrained model + model_config['pretrained'] = False + if export_neck: if hasattr(cfg.model, 'neck'): model_config['neck'] = cfg.model.neck diff --git a/easycv/apis/test.py b/easycv/apis/test.py index 7d3e3dda..d27a0291 100644 --- a/easycv/apis/test.py +++ b/easycv/apis/test.py @@ -15,6 +15,7 @@ from mmcv.parallel import (MMDataParallel, MMDistributedDataParallel, from mmcv.runner import get_dist_info from easycv.file import io +from easycv.framework.errors import ValueError from easycv.utils.torchacc_util import is_torchacc_enabled diff --git a/easycv/core/evaluation/coco_evaluation.py b/easycv/core/evaluation/coco_evaluation.py index fe5cc075..63891626 100644 --- a/easycv/core/evaluation/coco_evaluation.py +++ b/easycv/core/evaluation/coco_evaluation.py @@ -31,6 +31,7 @@ from easycv.core import standard_fields from easycv.core.evaluation import coco_tools from easycv.core.post_processing.nms import oks_nms, soft_oks_nms from easycv.core.standard_fields import DetectionResultFields, InputDataFields +from easycv.framework.errors import KeyError, TypeError, ValueError from easycv.utils.json_utils import MyEncoder from .base_evaluator import Evaluator from .builder import EVALUATORS @@ -365,7 +366,7 @@ class CocoDetectionEvaluator(Evaluator): def _check_mask_type_and_value(array_name, masks): """Checks whether mask dtype is uint8 and the values are either 0 or 1.""" if masks.dtype != np.uint8: - raise ValueError('{} must be of type np.uint8. Found {}.'.format( + raise TypeError('{} must be of type np.uint8. 
Found {}.'.format( array_name, masks.dtype)) if np.any(np.logical_and(masks != 0, masks != 1)): raise ValueError( diff --git a/easycv/core/evaluation/keypoint_eval.py b/easycv/core/evaluation/keypoint_eval.py index 0549a71f..4ab4f0c6 100644 --- a/easycv/core/evaluation/keypoint_eval.py +++ b/easycv/core/evaluation/keypoint_eval.py @@ -3,6 +3,7 @@ # https://github.com/open-mmlab/mmpose/blob/master/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py import numpy as np +from easycv.framework.errors import KeyError from .base_evaluator import Evaluator from .builder import EVALUATORS from .metric_registry import METRICS diff --git a/easycv/core/evaluation/metric_registry.py b/easycv/core/evaluation/metric_registry.py index 35f89f5d..5c2f3e0e 100644 --- a/easycv/core/evaluation/metric_registry.py +++ b/easycv/core/evaluation/metric_registry.py @@ -1,6 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import inspect +from easycv.framework.errors import KeyError, TypeError + class MetricRegistry(object): diff --git a/easycv/core/evaluation/segmentation_eval.py b/easycv/core/evaluation/segmentation_eval.py index d76d4d66..81cbd82f 100644 --- a/easycv/core/evaluation/segmentation_eval.py +++ b/easycv/core/evaluation/segmentation_eval.py @@ -5,6 +5,7 @@ import numpy as np import torch from prettytable import PrettyTable +from easycv.framework.errors import KeyError from easycv.utils.logger import print_log from .base_evaluator import Evaluator from .builder import EVALUATORS diff --git a/easycv/core/evaluation/top_down_eval.py b/easycv/core/evaluation/top_down_eval.py index ebb505e8..47a4f2dc 100644 --- a/easycv/core/evaluation/top_down_eval.py +++ b/easycv/core/evaluation/top_down_eval.py @@ -6,6 +6,7 @@ import cv2 import numpy as np from easycv.core.post_processing import transform_preds +from easycv.framework.errors import ValueError def _calc_distances(preds, targets, mask, normalize): diff --git a/easycv/core/optimizer/__init__.py b/easycv/core/optimizer/__init__.py index b4df330b..4c2bf30d 100644 --- a/easycv/core/optimizer/__init__.py +++ b/easycv/core/optimizer/__init__.py @@ -4,6 +4,7 @@ import torch from torch.optim import * from .builder import build_optimizer_constructor +from .lamb import Lamb from .lars import LARS from .layer_decay_optimizer_constructor import LayerDecayOptimizerConstructor from .ranger import Ranger diff --git a/easycv/core/optimizer/adam.py b/easycv/core/optimizer/adam.py index e015d523..f1bebd9d 100644 --- a/easycv/core/optimizer/adam.py +++ b/easycv/core/optimizer/adam.py @@ -8,6 +8,8 @@ from mmcv.runner.optimizer.builder import OPTIMIZERS from torch import Tensor from torch.optim import AdamW as _AdamW +from easycv.framework.errors import RuntimeError + def adamw(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor], max_exp_avg_sqs: List[Tensor], diff --git a/easycv/core/optimizer/lamb.py b/easycv/core/optimizer/lamb.py new file mode 100644 index 00000000..92a296fa --- /dev/null +++ b/easycv/core/optimizer/lamb.py @@ -0,0 +1,168 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import math + +import torch +from mmcv.runner import OPTIMIZERS +from torch.optim import Optimizer + +from easycv.framework.errors import RuntimeError + + +@OPTIMIZERS.register_module() +class Lamb(Optimizer): + """A pure pytorch variant of FuseLAMB (NvLamb variant) optimizer. + This class is copied from `timm`_. The LAMB was proposed in `Large Batch + Optimization for Deep Learning - Training BERT in 76 minutes`_. 
+ .. _timm: + https://github.com/rwightman/pytorch-image-models/blob/master/timm/optim/lamb.py + .. _Large Batch Optimization for Deep Learning - Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its norm. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + grad_averaging (bool, optional): whether apply (1-beta2) to grad when + calculating running averages of gradient. (default: True) + max_grad_norm (float, optional): value used to clip global grad norm + (default: 1.0) + trust_clip (bool): enable LAMBC trust ratio clipping (default: False) + always_adapt (boolean, optional): Apply adaptive learning rate to 0.0 + weight decay parameter (default: False) + """ # noqa: E501 + + def __init__(self, + params, + lr=1e-3, + bias_correction=True, + betas=(0.9, 0.999), + eps=1e-6, + weight_decay=0.01, + grad_averaging=True, + max_grad_norm=1.0, + trust_clip=False, + always_adapt=False): + defaults = dict( + lr=lr, + bias_correction=bias_correction, + betas=betas, + eps=eps, + weight_decay=weight_decay, + grad_averaging=grad_averaging, + max_grad_norm=max_grad_norm, + trust_clip=trust_clip, + always_adapt=always_adapt) + super().__init__(params, defaults) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + device = self.param_groups[0]['params'][0].device + one_tensor = torch.tensor( + 1.0, device=device + ) # because torch.where doesn't handle scalars correctly + global_grad_norm = torch.zeros(1, device=device) + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError( + 'Lamb does not support sparse gradients, consider ' + 'SparseAdam instead.') + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + # FIXME it'd be nice to remove explicit tensor conversion of scalars + # when torch.where promotes + # scalar types properly https://github.com/pytorch/pytorch/issues/9190 + max_grad_norm = torch.tensor( + self.defaults['max_grad_norm'], device=device) + clip_global_grad_norm = torch.where(global_grad_norm > max_grad_norm, + global_grad_norm / max_grad_norm, + one_tensor) + + for group in self.param_groups: + bias_correction = 1 if group['bias_correction'] else 0 + beta1, beta2 = group['betas'] + grad_averaging = 1 if group['grad_averaging'] else 0 + beta3 = 1 - beta1 if grad_averaging else 1.0 + + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or + # pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + if bias_correction: + bias_correction1 = 1 - beta1**group['step'] + bias_correction2 = 1 - beta2**group['step'] + else: + bias_correction1, bias_correction2 = 1.0, 1.0 + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.div_(clip_global_grad_norm) + state = self.state[p] + + # State initialization + if len(state) == 0: + # Exponential moving average of gradient valuesa + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=beta3) # m_t + exp_avg_sq.mul_(beta2).addcmul_( + grad, grad, value=1 - beta2) # v_t + + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_( + group['eps']) + update = (exp_avg / bias_correction1).div_(denom) + + weight_decay = group['weight_decay'] + if weight_decay != 0: + update.add_(p, alpha=weight_decay) + + if weight_decay != 0 or group['always_adapt']: + # Layer-wise LR adaptation. By default, skip adaptation on + # parameters that are + # excluded from weight decay, unless always_adapt == True, + # then always enabled. 
+ w_norm = p.norm(2.0) + g_norm = update.norm(2.0) + # FIXME nested where required since logical and/or not + # working in PT XLA + trust_ratio = torch.where( + w_norm > 0, + torch.where(g_norm > 0, w_norm / g_norm, one_tensor), + one_tensor, + ) + if group['trust_clip']: + # LAMBC trust clipping, upper bound fixed at one + trust_ratio = torch.minimum(trust_ratio, one_tensor) + update.mul_(trust_ratio) + + p.add_(update, alpha=-group['lr']) + + return loss diff --git a/easycv/core/optimizer/lars.py b/easycv/core/optimizer/lars.py index f6700bce..07d7d5f6 100644 --- a/easycv/core/optimizer/lars.py +++ b/easycv/core/optimizer/lars.py @@ -3,6 +3,8 @@ import torch from torch.optim import * # noqa: F401,F403 from torch.optim.optimizer import Optimizer, required +from easycv.framework.errors import ValueError + class LARS(Optimizer): r"""Implements layer-wise adaptive rate scaling for SGD. diff --git a/easycv/core/optimizer/layer_decay_optimizer_constructor.py b/easycv/core/optimizer/layer_decay_optimizer_constructor.py index 45625494..310bb38c 100644 --- a/easycv/core/optimizer/layer_decay_optimizer_constructor.py +++ b/easycv/core/optimizer/layer_decay_optimizer_constructor.py @@ -1,5 +1,3 @@ -# Reference from https://github.com/ViTAE-Transformer/ViTDet/blob/main/mmcv_custom/layer_decay_optimizer_constructor.py - import json from mmcv.runner import DefaultOptimizerConstructor, get_dist_info @@ -7,23 +5,32 @@ from mmcv.runner import DefaultOptimizerConstructor, get_dist_info from .builder import OPTIMIZER_BUILDERS -def get_num_layer_for_vit(var_name, num_max_layer, layer_sep=None): - if var_name in ('backbone.cls_token', 'backbone.mask_token', - 'backbone.pos_embed'): - return 0 - elif var_name.startswith('backbone.patch_embed'): - return 0 - elif var_name.startswith('backbone.blocks'): - layer_id = int(var_name.split('.')[2]) - return layer_id + 1 - else: - return num_max_layer - 1 +def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12): + """ + Calculate lr decay rate for different ViT blocks. + Reference from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py + Args: + name (string): parameter name. + lr_decay_rate (float): base lr decay rate. + num_layers (int): number of ViT blocks. + Returns: + lr decay rate for the given parameter. + """ + layer_id = num_layers + 1 + if '.pos_embed' in name or '.patch_embed' in name: + layer_id = 0 + elif '.blocks.' in name and '.residual.' not in name: + layer_id = int(name[name.find('.blocks.'):].split('.')[2]) + 1 + + scale = lr_decay_rate**(num_layers + 1 - layer_id) + + return layer_id, scale @OPTIMIZER_BUILDERS.register_module() class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor): - def add_params(self, params, module, prefix='', is_dcn_module=None): + def add_params(self, params, module): """Add all parameters of module to the params list. The parameters of the given module will be added to the list of param groups, with specific rules defined by paramwise_cfg. @@ -31,54 +38,41 @@ class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor): params (list[dict]): A list of param groups, it will be modified in place. module (nn.Module): The module to be added. - prefix (str): The prefix of the module - is_dcn_module (int|float|None): If the current module is a - submodule of DCN, `is_dcn_module` will be passed to - control conv_offset layer's learning rate. Defaults to None. 
+ + Reference from https://github.com/ViTAE-Transformer/ViTDet/blob/main/mmcv_custom/layer_decay_optimizer_constructor.py + Note: Currently, this optimizer constructor is built for ViTDet. """ - # get param-wise options parameter_groups = {} print(self.paramwise_cfg) - num_layers = self.paramwise_cfg.get('num_layers') + 2 - layer_sep = self.paramwise_cfg.get('layer_sep', None) - layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate') + lr_decay_rate = self.paramwise_cfg.get('layer_decay_rate') + num_layers = self.paramwise_cfg.get('num_layers') print('Build LayerDecayOptimizerConstructor %f - %d' % - (layer_decay_rate, num_layers)) + (lr_decay_rate, num_layers)) + lr = self.base_lr weight_decay = self.base_wd - custom_keys = self.paramwise_cfg.get('custom_keys', {}) - # first sort with alphabet order and then sort with reversed len of str - sorted_keys = sorted(custom_keys.keys()) - for name, param in module.named_parameters(): if not param.requires_grad: continue # frozen weights - if len(param.shape) == 1 or name.endswith('.bias') or ( - 'pos_embed' in name) or ('cls_token' - in name) or ('rel_pos_' in name): + if 'backbone' in name and ('.norm' in name or '.pos_embed' in name + or '.gn.' in name or '.ln.' in name): group_name = 'no_decay' this_weight_decay = 0. else: group_name = 'decay' this_weight_decay = weight_decay - layer_id = get_num_layer_for_vit(name, num_layers, layer_sep) + if name.startswith('backbone'): + layer_id, scale = get_vit_lr_decay_rate( + name, lr_decay_rate=lr_decay_rate, num_layers=num_layers) + else: + layer_id, scale = -1, 1 group_name = 'layer_%d_%s' % (layer_id, group_name) - # if the parameter match one of the custom keys, ignore other rules - this_lr_multi = 1. - for key in sorted_keys: - if key in f'{name}': - lr_mult = custom_keys[key].get('lr_mult', 1.) 
- this_lr_multi = lr_mult - group_name = '%s_%s' % (group_name, key) - break - if group_name not in parameter_groups: - scale = layer_decay_rate**(num_layers - layer_id - 1) parameter_groups[group_name] = { 'weight_decay': this_weight_decay, @@ -86,7 +80,7 @@ class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor): 'param_names': [], 'lr_scale': scale, 'group_name': group_name, - 'lr': scale * self.base_lr * this_lr_multi, + 'lr': scale * lr, } parameter_groups[group_name]['params'].append(param) diff --git a/easycv/core/optimizer/ranger.py b/easycv/core/optimizer/ranger.py index 727b6f0e..5ec04aac 100644 --- a/easycv/core/optimizer/ranger.py +++ b/easycv/core/optimizer/ranger.py @@ -4,6 +4,8 @@ import math import torch from torch.optim.optimizer import Optimizer +from easycv.framework.errors import ValueError + def centralized_gradient(x, use_gc=True, gc_conv_only=False): '''credit - https://github.com/Yonghongwei/Gradient-Centralization ''' diff --git a/easycv/core/sailfish/linear.py b/easycv/core/sailfish/linear.py index 6386dab6..939de5b4 100644 --- a/easycv/core/sailfish/linear.py +++ b/easycv/core/sailfish/linear.py @@ -22,6 +22,7 @@ import torch from easycv.core.sailfish.util import (BiasUniformInitializer, KaimingUniformInitializer, ModelParallel, RenormUniformInitializer) +from easycv.framework.errors import ValueError class Linear(torch.nn.Module): diff --git a/easycv/core/sailfish/util.py b/easycv/core/sailfish/util.py index 57155cbc..c54fd61d 100644 --- a/easycv/core/sailfish/util.py +++ b/easycv/core/sailfish/util.py @@ -25,6 +25,7 @@ from easycv.core.sailfish.function import (all_cat, all_log_softmax, shard_correct_predictions, shard_target_and_mask, shard_topk_correct_predictions) +from easycv.framework.errors import NotImplementedError, ValueError class DistributedParallel: diff --git a/easycv/core/visualization/image.py b/easycv/core/visualization/image.py index 3b61f50b..9c79341c 100644 --- a/easycv/core/visualization/image.py +++ b/easycv/core/visualization/image.py @@ -10,6 +10,8 @@ import numpy as np from mmcv.utils.misc import deprecated_api_warning from PIL import Image, ImageDraw, ImageFont +from easycv.framework.errors import FileNotFoundError + def get_font_path(): root_path = opd(opd(opd(os.path.realpath(__file__)))) @@ -22,8 +24,8 @@ def get_font_path(): elif os.path.exists(find_path_source): return find_path_source else: - raise ValueError('Not find font file both in %s and %s' % - (find_path_whl, find_path_source)) + raise FileNotFoundError('Not find font file both in %s and %s' % + (find_path_whl, find_path_source)) _FONT_PATH = get_font_path() diff --git a/easycv/datasets/classification/data_sources/image_list.py b/easycv/datasets/classification/data_sources/image_list.py index 9835daa7..e37f9fa8 100644 --- a/easycv/datasets/classification/data_sources/image_list.py +++ b/easycv/datasets/classification/data_sources/image_list.py @@ -7,6 +7,7 @@ from PIL import Image, ImageFile from easycv.datasets.registry import DATASOURCES from easycv.file import io +from easycv.framework.errors import TypeError from easycv.utils.dist_utils import dist_zero_exec from .utils import split_listfile_byrank @@ -54,8 +55,8 @@ class ClsSourceImageList(object): 'list_file should be str or list(str)' root = [root] if isinstance(root, str) else root if not isinstance(root, list): - raise ValueError('root must be str or list(str), but get %s' % - type(root)) + raise TypeError('root must be str or list(str), but get %s' % + type(root)) if len(root) < len(list_file): 
logging.warning( diff --git a/easycv/datasets/classification/odps.py b/easycv/datasets/classification/odps.py index 24e24006..e8bf62f5 100644 --- a/easycv/datasets/classification/odps.py +++ b/easycv/datasets/classification/odps.py @@ -3,6 +3,7 @@ from PIL import Image from easycv.datasets.registry import DATASETS from easycv.datasets.shared.base import BaseDataset +from easycv.framework.errors import NotImplementedError @DATASETS.register_module diff --git a/easycv/datasets/classification/pipelines/auto_augment.py b/easycv/datasets/classification/pipelines/auto_augment.py index 9f4137e6..84c8b3ae 100644 --- a/easycv/datasets/classification/pipelines/auto_augment.py +++ b/easycv/datasets/classification/pipelines/auto_augment.py @@ -8,10 +8,11 @@ from typing import Sequence import mmcv import numpy as np -from PIL import Image +from PIL import Image, ImageFilter from easycv.datasets.registry import PIPELINES from easycv.datasets.shared.pipelines import Compose +from easycv.framework.errors import TypeError # Default hyperparameters for all Ops _HPARAMS_DEFAULT = dict(pad_val=128) @@ -1043,3 +1044,37 @@ class Cutout(object): repr_str += f'pad_val={self.pad_val}, ' repr_str += f'prob={self.prob})' return repr_str + + +@PIPELINES.register_module() +class PILGaussianBlur(object): + + def __init__(self, prob=0.1, radius_min=0.1, radius_max=2.): + assert 0 <= prob <= 1.0, 'The prob should be in range [0,1], ' \ + f'got {prob} instead.' + assert isinstance(radius_min, (int, float)), 'The radius_min type must '\ + f'be int or float, but got {type(radius_min)} instead.' + assert isinstance(radius_max, (int, float)), 'The radius_max type must '\ + f'be int or float, but got {type(radius_max)} instead.' + + self.prob = prob + self.radius_min = radius_min + self.radius_max = radius_max + + def __call__(self, results): + if np.random.rand() > self.prob: + return results + + for key in results.get('img_fields', ['img']): + img = results[key].filter( + ImageFilter.GaussianBlur( + radius=random.uniform(self.radius_min, self.radius_max))) + results[key] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'radius_min={self.radius_min}, ' + repr_str += f'radius_max={self.radius_max})' + return repr_str diff --git a/easycv/datasets/detection/data_sources/base.py b/easycv/datasets/detection/data_sources/base.py index 60bf04a3..fb1677be 100644 --- a/easycv/datasets/detection/data_sources/base.py +++ b/easycv/datasets/detection/data_sources/base.py @@ -10,6 +10,7 @@ from mmcv.runner.dist_utils import get_dist_info from tqdm import tqdm from easycv.file.image import load_image +from easycv.framework.errors import NotImplementedError, ValueError def _load_image(img_path): diff --git a/easycv/datasets/detection/data_sources/coco.py b/easycv/datasets/detection/data_sources/coco.py index 1f1efca4..76709f32 100644 --- a/easycv/datasets/detection/data_sources/coco.py +++ b/easycv/datasets/detection/data_sources/coco.py @@ -4,6 +4,7 @@ from xtcocotools.coco import COCO from easycv.datasets.registry import DATASOURCES, PIPELINES from easycv.datasets.shared.pipelines import Compose +from easycv.framework.errors import TypeError from easycv.utils.registry import build_from_cfg diff --git a/easycv/datasets/detection/data_sources/coco_panoptic.py b/easycv/datasets/detection/data_sources/coco_panoptic.py index fcf2060a..9ee2ea96 100644 --- a/easycv/datasets/detection/data_sources/coco_panoptic.py +++ 
b/easycv/datasets/detection/data_sources/coco_panoptic.py @@ -8,6 +8,7 @@ from xtcocotools.coco import COCO from easycv.datasets.detection.data_sources import DetSourceCoco from easycv.datasets.registry import DATASOURCES, PIPELINES from easycv.datasets.shared.pipelines import Compose +from easycv.framework.errors import RuntimeError, TypeError from easycv.utils.registry import build_from_cfg try: diff --git a/easycv/datasets/detection/data_sources/pai_format.py b/easycv/datasets/detection/data_sources/pai_format.py index 6f390c86..8ad26e5a 100644 --- a/easycv/datasets/detection/data_sources/pai_format.py +++ b/easycv/datasets/detection/data_sources/pai_format.py @@ -8,6 +8,7 @@ import numpy as np from easycv.datasets.detection.data_sources.base import DetSourceBase from easycv.datasets.registry import DATASOURCES from easycv.file import io +from easycv.framework.errors import NotImplementedError, ValueError def get_prior_task_id(keys): diff --git a/easycv/datasets/detection/mix.py b/easycv/datasets/detection/mix.py index 6e7d203f..19394908 100644 --- a/easycv/datasets/detection/mix.py +++ b/easycv/datasets/detection/mix.py @@ -9,6 +9,7 @@ import numpy as np import torch from easycv.datasets.registry import DATASETS, PIPELINES +from easycv.framework.errors import TypeError from easycv.utils.bbox_util import xyxy2xywh as xyxy2cxcywh from easycv.utils.registry import build_from_cfg from .raw import DetDataset diff --git a/easycv/datasets/detection/pipelines/mm_transforms.py b/easycv/datasets/detection/pipelines/mm_transforms.py index cd4257cc..0c5846ec 100644 --- a/easycv/datasets/detection/pipelines/mm_transforms.py +++ b/easycv/datasets/detection/pipelines/mm_transforms.py @@ -13,6 +13,7 @@ from torchvision.transforms import functional as F from easycv.datasets.registry import PIPELINES from easycv.datasets.shared.pipelines.transforms import Compose +from easycv.framework.errors import KeyError, NotImplementedError, TypeError try: from panopticapi.utils import rgb2id @@ -1122,8 +1123,8 @@ class MMRandomFlip: elif flip_ratio is None: pass else: - raise ValueError('flip_ratios must be None, float, ' - 'or list of float') + raise TypeError('flip_ratios must be None, float, ' + 'or list of float') self.flip_ratio = flip_ratio valid_directions = ['horizontal', 'vertical', 'diagonal'] @@ -1133,7 +1134,7 @@ class MMRandomFlip: assert mmcv.is_list_of(direction, str) assert set(direction).issubset(set(valid_directions)) else: - raise ValueError('direction must be either str or list of str') + raise TypeError('direction must be either str or list of str') self.direction = direction if isinstance(flip_ratio, list): @@ -1168,7 +1169,7 @@ class MMRandomFlip: flipped[..., 2::4] = w - bboxes[..., 0::4] flipped[..., 3::4] = h - bboxes[..., 1::4] else: - raise ValueError(f"Invalid flipping direction '{direction}'") + raise KeyError(f"Invalid flipping direction '{direction}'") return flipped def __call__(self, results): @@ -1274,7 +1275,7 @@ class MMRandomCrop: if crop_type not in [ 'relative_range', 'relative', 'absolute', 'absolute_range' ]: - raise ValueError(f'Invalid crop_type {crop_type}.') + raise KeyError(f'Invalid crop_type {crop_type}.') if crop_type in ['absolute', 'absolute_range']: assert crop_size[0] > 0 and crop_size[1] > 0 assert isinstance(crop_size[0], int) and isinstance( diff --git a/easycv/datasets/detection/raw.py b/easycv/datasets/detection/raw.py index 49f6a3b1..3f6800a1 100644 --- a/easycv/datasets/detection/raw.py +++ b/easycv/datasets/detection/raw.py @@ -9,6 +9,7 @@ from 
easycv.datasets.detection.data_sources import DetSourceCoco from easycv.datasets.registry import DATASETS from easycv.datasets.shared.base import BaseDataset from easycv.file.image import load_image +from easycv.framework.errors import TimeoutError @DATASETS.register_module @@ -38,7 +39,7 @@ class DetDataset(BaseDataset): count = 0 while True: if count > 10: - raise RuntimeError('Loops timeout') + raise TimeoutError('Loops timeout') data_dict = self.data_source[idx] data_dict = self.pipeline(data_dict) if data_dict is None: diff --git a/easycv/datasets/loader/build_loader.py b/easycv/datasets/loader/build_loader.py index 6af50073..4977553b 100644 --- a/easycv/datasets/loader/build_loader.py +++ b/easycv/datasets/loader/build_loader.py @@ -11,10 +11,11 @@ from mmcv.runner import get_dist_info from torch.utils.data import DataLoader, RandomSampler from easycv.datasets.shared.odps_reader import set_dataloader_workid +from easycv.framework.errors import NotImplementedError from easycv.utils.dist_utils import sync_random_seed from easycv.utils.torchacc_util import is_torchacc_enabled from .collate import CollateWrapper -from .sampler import DistributedMPSampler, DistributedSampler +from .sampler import DistributedMPSampler, DistributedSampler, RASampler if platform.system() != 'Windows': # https://github.com/pytorch/pytorch/issues/973 @@ -35,6 +36,7 @@ def build_dataloader(dataset, odps_config=None, persistent_workers=False, collate_hooks=None, + use_repeated_augment_sampler=False, **kwargs): """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. @@ -56,6 +58,8 @@ def build_dataloader(dataset, data in worker process can be reused. persistent_workers (bool) : After pytorch1.7, could use persistent_workers=True to avoid reconstruct dataworker before each epoch, speed up before epoch + use_repeated_augment_sampler (bool) : If set true, it will use RASampler. + Default: False. kwargs: any keyword argument to be used to initialize DataLoader Returns: DataLoader: A PyTorch dataloader. 
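To make the new `use_repeated_augment_sampler` option more concrete, the toy walkthrough below mimics the index scheduling of the `RASampler` added further down in `easycv/datasets/loader/sampler.py` (illustrative only; the sizes and variable names here are made up, and `torch.distributed` is not needed): every rank shuffles with the same epoch seed, repeats each index `num_repeats` times, then takes an interleaved slice, so repeated (differently augmented) copies of a sample are spread across GPUs.

```python
import math
import torch

# Toy sizes chosen for illustration
dataset_len, num_replicas, num_repeats, epoch = 8, 2, 3, 0

g = torch.Generator()
g.manual_seed(epoch)  # every rank shuffles with the same epoch seed
indices = torch.randperm(dataset_len, generator=g)
# repeat each index num_repeats times: [a, a, a, b, b, b, ...]
indices = torch.repeat_interleave(indices, repeats=num_repeats).tolist()

# pad so the list splits evenly across replicas
total_size = math.ceil(dataset_len * num_repeats / num_replicas) * num_replicas
indices += indices[:total_size - len(indices)]

for rank in range(num_replicas):
    # each rank takes an interleaved slice, so repeated copies of the same
    # sample land on different ranks/GPUs rather than all on one
    print(rank, indices[rank:total_size:num_replicas])
```

The actual sampler additionally truncates each epoch to its `num_selected_samples`, so one pass over the repeated indices still corresponds roughly to one epoch of unique samples.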
@@ -68,7 +72,9 @@ def build_dataloader(dataset, 'split_huge_listfile_byrank', False) - if hasattr(dataset, 'm_per_class') and dataset.m_per_class > 1: + if use_repeated_augment_sampler: + sampler = RASampler(dataset, world_size, rank, shuffle=shuffle) + elif hasattr(dataset, 'm_per_class') and dataset.m_per_class > 1: sampler = DistributedMPSampler( dataset, world_size, @@ -88,7 +94,10 @@ def build_dataloader(dataset, else: if replace: raise NotImplementedError - if hasattr(dataset, 'm_per_class') and dataset.m_per_class > 1: + + if use_repeated_augment_sampler: + sampler = RASampler(dataset, 1, 0, shuffle=shuffle) + elif hasattr(dataset, 'm_per_class') and dataset.m_per_class > 1: sampler = DistributedMPSampler( dataset, 1, 0, shuffle=shuffle, replace=replace) else: diff --git a/easycv/datasets/loader/sampler.py b/easycv/datasets/loader/sampler.py index 6fe6863c..fd39d054 100644 --- a/easycv/datasets/loader/sampler.py +++ b/easycv/datasets/loader/sampler.py @@ -6,10 +6,13 @@ import random import numpy as np import torch +import torch.distributed as dist from mmcv.runner import get_dist_info from torch.utils.data import DistributedSampler as _DistributedSampler from torch.utils.data import Sampler +from easycv.framework.errors import ValueError + class DistributedMPSampler(_DistributedSampler): @@ -83,7 +86,9 @@ class DistributedMPSampler(_DistributedSampler): self.label_list = [] if not self.dataset.data_source.has_labels: - raise 'MPSampler need initial with classification datasets which has label!' + raise ValueError( + 'MPSampler need initial with classification datasets which has label!' + ) for idx, label in enumerate(self.dataset.data_source.labels): if label in self.label_dict.keys(): @@ -469,3 +474,73 @@ class DistributedGivenIterationSampler(Sampler): def set_epoch(self, epoch): pass + + +class RASampler(torch.utils.data.Sampler): + """Sampler that restricts data loading to a subset of the dataset for distributed, + with repeated augmentation. 
+ It ensures that different each augmented version of a sample will be visible to a + different process (GPU) + Heavily based on torch.utils.data.DistributedSampler + """ + + def __init__(self, + dataset, + num_replicas=None, + rank=None, + shuffle=True, + num_repeats: int = 3): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError( + 'Requires distributed package to be available') + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError( + 'Requires distributed package to be available') + rank = dist.get_rank() + if num_repeats < 1: + raise ValueError('num_repeats should be greater than 0') + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.num_repeats = num_repeats + self.epoch = 0 + self.num_samples = int( + math.ceil( + len(self.dataset) * self.num_repeats / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas)) + self.num_selected_samples = int( + math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) + self.shuffle = shuffle + + def __iter__(self): + if self.shuffle: + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g) + else: + indices = torch.arange(start=0, end=len(self.dataset)) + + # add extra samples to make it evenly divisible + indices = torch.repeat_interleave( + indices, repeats=self.num_repeats, dim=0).tolist() + padding_size: int = self.total_size - len(indices) + if padding_size > 0: + indices += indices[:padding_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices[:self.num_selected_samples]) + + def __len__(self): + return self.num_selected_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/easycv/datasets/pose/data_sources/coco.py b/easycv/datasets/pose/data_sources/coco.py index 86db5dd0..0f9e9260 100644 --- a/easycv/datasets/pose/data_sources/coco.py +++ b/easycv/datasets/pose/data_sources/coco.py @@ -7,6 +7,7 @@ import json_tricks as json import numpy as np from easycv.datasets.registry import DATASOURCES +from easycv.framework.errors import ValueError from .top_down import PoseTopDownSource COCO_DATASET_INFO = dict( diff --git a/easycv/datasets/pose/data_sources/top_down.py b/easycv/datasets/pose/data_sources/top_down.py index f892bc4c..3f20d7b3 100644 --- a/easycv/datasets/pose/data_sources/top_down.py +++ b/easycv/datasets/pose/data_sources/top_down.py @@ -12,6 +12,7 @@ from mmcv.utils.path import is_filepath from xtcocotools.coco import COCO from easycv.datasets.registry import DATASOURCES +from easycv.framework.errors import ValueError class DatasetInfo: diff --git a/easycv/datasets/pose/hand_coco_wholebody_dataset.py b/easycv/datasets/pose/hand_coco_wholebody_dataset.py index 3084ba02..5cbd65a7 100644 --- a/easycv/datasets/pose/hand_coco_wholebody_dataset.py +++ b/easycv/datasets/pose/hand_coco_wholebody_dataset.py @@ -6,6 +6,7 @@ from easycv.core.evaluation.keypoint_eval import KeyPointEvaluator from easycv.datasets.pose.data_sources.coco import PoseTopDownSource from easycv.datasets.registry import DATASETS from easycv.datasets.shared.base import BaseDataset +from easycv.framework.errors import ValueError @DATASETS.register_module() diff --git 
a/easycv/datasets/pose/pipelines/transforms.py b/easycv/datasets/pose/pipelines/transforms.py index 27c7c325..8401ee8f 100644 --- a/easycv/datasets/pose/pipelines/transforms.py +++ b/easycv/datasets/pose/pipelines/transforms.py @@ -9,6 +9,7 @@ from easycv.core.post_processing import (affine_transform, fliplr_joints, get_affine_transform, get_warp_matrix, warp_affine_joints) from easycv.datasets.registry import PIPELINES +from easycv.framework.errors import ValueError @PIPELINES.register_module() diff --git a/easycv/datasets/pose/top_down.py b/easycv/datasets/pose/top_down.py index 3d972208..1946a654 100644 --- a/easycv/datasets/pose/top_down.py +++ b/easycv/datasets/pose/top_down.py @@ -3,6 +3,7 @@ from easycv.core.evaluation.coco_evaluation import CoCoPoseTopDownEvaluator from easycv.datasets.pose.data_sources.coco import PoseTopDownSource from easycv.datasets.registry import DATASETS from easycv.datasets.shared.base import BaseDataset +from easycv.framework.errors import ValueError @DATASETS.register_module() diff --git a/easycv/datasets/segmentation/data_sources/base.py b/easycv/datasets/segmentation/data_sources/base.py index 888fd477..a893932e 100644 --- a/easycv/datasets/segmentation/data_sources/base.py +++ b/easycv/datasets/segmentation/data_sources/base.py @@ -12,6 +12,7 @@ from tqdm import tqdm from easycv.datasets.registry import DATASOURCES from easycv.file.image import load_image as _load_img +from easycv.framework.errors import NotImplementedError, ValueError def load_image(img_path): @@ -26,7 +27,7 @@ def load_image(img_path): def load_seg_map(seg_path, reduce_zero_label): - gt_semantic_seg = _load_img(seg_path, mode='RGB') + gt_semantic_seg = _load_img(seg_path, mode='P') # reduce zero_label if reduce_zero_label: # avoid using underflow conversion diff --git a/easycv/datasets/selfsup/data_sources/image_list.py b/easycv/datasets/selfsup/data_sources/image_list.py index fa61de5d..93637b57 100644 --- a/easycv/datasets/selfsup/data_sources/image_list.py +++ b/easycv/datasets/selfsup/data_sources/image_list.py @@ -7,6 +7,7 @@ from PIL import Image, ImageFile from easycv.datasets.registry import DATASOURCES from easycv.file import io +from easycv.framework.errors import ValueError @DATASOURCES.register_module diff --git a/easycv/datasets/shared/multi_view.py b/easycv/datasets/shared/multi_view.py index 7c96d9d6..b5bbd43f 100644 --- a/easycv/datasets/shared/multi_view.py +++ b/easycv/datasets/shared/multi_view.py @@ -7,6 +7,7 @@ from easycv.datasets.builder import build_datasource from easycv.datasets.registry import DATASETS, PIPELINES from easycv.datasets.shared.base import BaseDataset from easycv.datasets.shared.pipelines.transforms import Compose +from easycv.framework.errors import NotImplementedError from easycv.utils.registry import build_from_cfg diff --git a/easycv/datasets/shared/pipelines/format.py b/easycv/datasets/shared/pipelines/format.py index 22a463e2..d46e0c34 100644 --- a/easycv/datasets/shared/pipelines/format.py +++ b/easycv/datasets/shared/pipelines/format.py @@ -7,6 +7,7 @@ import torch from mmcv.parallel import DataContainer as DC from easycv.datasets.registry import PIPELINES +from easycv.framework.errors import TypeError def to_tensor(data): diff --git a/easycv/datasets/shared/pipelines/transforms.py b/easycv/datasets/shared/pipelines/transforms.py index 7de5c0a9..31e4a966 100644 --- a/easycv/datasets/shared/pipelines/transforms.py +++ b/easycv/datasets/shared/pipelines/transforms.py @@ -6,6 +6,7 @@ import numpy as np from easycv.datasets.registry import 
PIPELINES from easycv.file.image import load_image +from easycv.framework.errors import TypeError from easycv.utils.registry import build_from_cfg diff --git a/easycv/datasets/shared/raw.py b/easycv/datasets/shared/raw.py index ed30275c..10f5e0a3 100644 --- a/easycv/datasets/shared/raw.py +++ b/easycv/datasets/shared/raw.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from easycv.datasets.registry import DATASETS +from easycv.framework.errors import NotImplementedError from .base import BaseDataset diff --git a/easycv/file/base.py b/easycv/file/base.py index 291e219c..47ee5d73 100644 --- a/easycv/file/base.py +++ b/easycv/file/base.py @@ -9,6 +9,8 @@ from datetime import datetime from functools import lru_cache from typing import List, Union +from easycv.framework.errors import NotImplementedError + class IOBase: diff --git a/easycv/file/file_io.py b/easycv/file/file_io.py index 50e052eb..ac3f5d28 100644 --- a/easycv/file/file_io.py +++ b/easycv/file/file_io.py @@ -11,6 +11,8 @@ from typing import List, Union from tqdm import tqdm from tqdm.utils import CallbackIOWrapper +from easycv.framework.errors import (FileNotFoundError, IOError, RuntimeError, + ValueError) from .base import IOLocal from .utils import (OSS_PREFIX, create_namedtuple, get_oss_config, is_oss_path, mute_stderr, oss_progress) @@ -198,7 +200,7 @@ class IO(IOLocal): time.sleep(3) if data is None: - raise ValueError('Read file error: %s!' % full_path) + raise IOError('Read file error: %s!' % full_path) if mode == 'rb': return NullContextWrapper(BytesIO(data)) @@ -519,6 +521,11 @@ class IO(IOLocal): ] if path in files: files.remove(path) + if recursive: + files = [ + i for i in files + if not self.isdir(f'{OSS_PREFIX}{bucket.bucket_name}/{i}') + ] if not files and not self._obj_exists(bucket, path): raise FileNotFoundError( diff --git a/easycv/file/image.py b/easycv/file/image.py index 2b5420b2..3a1fff90 100644 --- a/easycv/file/image.py +++ b/easycv/file/image.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import io import logging import time @@ -6,9 +7,10 @@ import cv2 import numpy as np from PIL import Image -from easycv.file import io +from easycv import file +from easycv.framework.errors import IOError from easycv.utils.constant import MAX_READ_IMAGE_TRY_TIMES -from .utils import is_oss_path +from .utils import is_oss_path, is_url_path def load_image(img_path, mode='BGR', max_try_times=MAX_READ_IMAGE_TRY_TIMES): @@ -19,16 +21,31 @@ def load_image(img_path, mode='BGR', max_try_times=MAX_READ_IMAGE_TRY_TIMES): img = None while try_cnt < max_try_times: try: - with io.open(img_path, 'rb') as infile: - # cv2.imdecode may corrupt when the img is broken - image = Image.open(infile) # RGB + if is_url_path(img_path): + from mmcv.fileio.file_client import HTTPBackend + client = HTTPBackend() + img_bytes = client.get(img_path) + buff = io.BytesIO(img_bytes) + image = Image.open(buff) + if mode.upper() != 'BGR' and image.mode.upper() != mode.upper( + ): + image = image.convert(mode.upper()) img = np.asarray(image, dtype=np.uint8) - if mode.upper() == 'BGR': - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - assert mode.upper() in ['RGB', 'BGR' - ], 'Only support `RGB` and `BGR` mode!' 
- assert img is not None - break + else: + with file.io.open(img_path, 'rb') as infile: + # cv2.imdecode may corrupt when the img is broken + image = Image.open(infile) + if mode.upper() != 'BGR' and image.mode.upper( + ) != mode.upper(): + image = image.convert(mode.upper()) + img = np.asarray(image, dtype=np.uint8) + + if mode.upper() == 'BGR': + if image.mode.upper() != 'RGB': + image = image.convert('RGB') + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + assert img is not None + break except Exception as e: logging.error(e) logging.warning('Read file {} fault, try count : {}'.format( @@ -43,6 +60,6 @@ def load_image(img_path, mode='BGR', max_try_times=MAX_READ_IMAGE_TRY_TIMES): try_cnt += 1 if img is None: - raise ValueError('Read Image Error: ' + img_path) + raise IOError('Read Image Error: ' + img_path) return img diff --git a/easycv/file/utils.py b/easycv/file/utils.py index dcf13f4c..f943c725 100644 --- a/easycv/file/utils.py +++ b/easycv/file/utils.py @@ -10,8 +10,10 @@ from io import StringIO from tqdm import tqdm +from easycv.framework.errors import ValueError + OSS_PREFIX = 'oss://' -URL_PREFIX = 'https://' +URL_PREFIX = ('https://', 'http://') def create_namedtuple(**kwargs): @@ -31,6 +33,7 @@ def url_path_exists(url): urllib.request.urlopen(url).code except Exception as err: print(err) + return False return True diff --git a/easycv/framework/__init__.py b/easycv/framework/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/easycv/framework/errors.py b/easycv/framework/errors.py new file mode 100644 index 00000000..4fa8e8b7 --- /dev/null +++ b/easycv/framework/errors.py @@ -0,0 +1,128 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +# total 64 bit +# 63~64 (question category): 01 (user), ... +# 60~62 (error severity): 001 (ERROR), 010(WARNING), 011(INFO), 100 (DEBUG), ... +# 54~59 (product): 00000011 (PAI) +# 49~53 (sub product): 00000 (none) +# 41~48 (framework): 00000001 (tensorflow), 00000010 (pytorch) +# 1~40 (error code) +OK = 0x5818008000000000 +RUNTIME = 0x4818008000000001 +UNIMPLEMENTED = 0x4818008000000002 +INVALID_ARGUMENT = 0x4818008000000003 +INVALID_VALUE = 0x4818008000000004 +INVALID_KEY = 0x4818008000000005 +INVALID_TYPE = 0x4818008000000006 +MODULE_NOT_FOUND = 0x4818008000000007 +FILE_NOT_FOUND = 0x4818008000000008 +IO_FAILED = 0x4818008000000009 +PERMISSION_DENIED = 0x481800800000000a +TIMEOUT = 0x481800800000000b + + +class BaseError(Exception): + """The base error class for exceptions. + """ + code = None + + def __init__(self, message='', details=None, op=None): + """Creates a new `OpError` indicating that a particular op failed. + + Args: + message: The message string describing the failure. + details: The help message that handle the error. + op: The `ops.Operation` that failed, if known; otherwise None. During + eager execution, this field is always `None`. + """ + super(BaseError, self).__init__() + self._op = op + self._message = message + self._details = details + + @property + def message(self): + """The error message that describes the error.""" + return self._message + + @property + def details(self): + """The help message that handle the error.""" + return self._details + + @property + def op(self): + """The operation that failed, if known. + Returns: + The `Operation` that failed, or None. 
+ """ + return self._op + + @property + def error_code(self): + """The integer error code that describes the error.""" + return hex(self.code) + + def __str__(self): + print_str = 'ErrorCode: ' + self.error_code + if self.op is not None: + print_str += '\n' + 'Operation: ' + str(self.op) + print_str += '\n' + 'Message: ' + self.message + if self.details is not None: + print_str += '\n' + 'Details: ' + self.details + return print_str + + +class NotImplementedError(BaseError): + """Raised when an operation has not been implemented.""" + code = UNIMPLEMENTED + + +class RuntimeError(BaseError): + """Raised when the system experiences an internal error.""" + code = RUNTIME + + +class PermissionDeniedError(BaseError): + """Raised when the caller does not have permission to run an operation.""" + code = PERMISSION_DENIED + + +class FileNotFoundError(BaseError): + """Raised when a requested entity was not found.""" + code = FILE_NOT_FOUND + + +class ModuleNotFoundError(BaseError): + """Raised when a module could not be located.""" + code = MODULE_NOT_FOUND + + +class InvalidArgumentError(BaseError): + """Raised when an operation receives an invalid argument.""" + code = INVALID_ARGUMENT + + +class TimeoutError(BaseError): + """Raised when an operation timed out.""" + code = TIMEOUT + + +class IOError(BaseError): + """Raised when an operation returns a system-related error, including I/O failures.""" + code = IO_FAILED + + +class ValueError(BaseError): + """Raised when an operation receives an invalid value.""" + code = INVALID_VALUE + + +class KeyError(BaseError): + """Raised when a mapping (dictionary) key is not found in the set of existing keys.""" + code = INVALID_KEY + + +class TypeError(BaseError): + """Raised when an operation or function is applied to an object of inappropriate type.""" + code = INVALID_TYPE diff --git a/easycv/hooks/__init__.py b/easycv/hooks/__init__.py index c292038f..f2814dac 100644 --- a/easycv/hooks/__init__.py +++ b/easycv/hooks/__init__.py @@ -13,7 +13,8 @@ from .eval_hook import DistEvalHook, EvalHook from .export_hook import ExportHook from .extractor import Extractor from .logger import PreLoggerHook -from .lr_update_hook import StepFixCosineAnnealingLrUpdaterHook +from .lr_update_hook import (CosineAnnealingWarmupByEpochLrUpdaterHook, + StepFixCosineAnnealingLrUpdaterHook) from .optimizer_hook import OptimizerHook from .oss_sync_hook import OSSSyncHook from .registry import HOOKS @@ -33,7 +34,8 @@ __all__ = [ 'OSSSyncHook', 'HOOKS', 'TIMEHook', 'SWAVHook', 'SyncNormHook', 'SyncRandomSizeHook', 'TensorboardLoggerHookV2', 'WandbLoggerHookV2', 'YOLOXLrUpdaterHook', 'YOLOXModeSwitchHook', 'MixupCollateHook', - 'PreLoggerHook', 'StepFixCosineAnnealingLrUpdaterHook', 'ThroughputHook' + 'PreLoggerHook', 'StepFixCosineAnnealingLrUpdaterHook', + 'CosineAnnealingWarmupByEpochLrUpdaterHook', 'ThroughputHook' ] if LooseVersion(torch.__version__) >= LooseVersion('1.6.0'): diff --git a/easycv/hooks/eval_hook.py b/easycv/hooks/eval_hook.py index e221e160..a4065617 100644 --- a/easycv/hooks/eval_hook.py +++ b/easycv/hooks/eval_hook.py @@ -7,6 +7,7 @@ from mmcv.runner import Hook from torch.utils.data import DataLoader from easycv.datasets.loader.loader_wrapper import TorchaccLoaderWrapper +from easycv.framework.errors import TypeError from easycv.hooks.tensorboard import TensorboardLoggerHookV2 from easycv.hooks.wandb import WandbLoggerHookV2 diff --git a/easycv/hooks/extractor.py b/easycv/hooks/extractor.py index 6e6acafc..e81be9ae 100644 --- a/easycv/hooks/extractor.py +++ 
b/easycv/hooks/extractor.py @@ -2,6 +2,7 @@ import torch.nn as nn from torch.utils.data import Dataset +from easycv.framework.errors import TypeError from easycv.utils.collect import dist_forward_collect, nondist_forward_collect diff --git a/easycv/hooks/lr_update_hook.py b/easycv/hooks/lr_update_hook.py index 39ca8f53..af1bc514 100644 --- a/easycv/hooks/lr_update_hook.py +++ b/easycv/hooks/lr_update_hook.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from mmcv import runner from mmcv.runner import HOOKS from mmcv.runner.hooks.lr_updater import (CosineAnnealingLrUpdaterHook, annealing_cos) @@ -54,3 +55,29 @@ class StepFixCosineAnnealingLrUpdaterHook(CosineAnnealingLrUpdaterHook): target_lr = self.min_lr return annealing_cos(base_lr, target_lr, progress / max_progress) + + +@HOOKS.register_module() +class CosineAnnealingWarmupByEpochLrUpdaterHook(CosineAnnealingLrUpdaterHook): + + def before_train_iter(self, runner: 'runner.BaseRunner'): + cur_iter = runner.iter + epoch_len = len(runner.data_loader) + assert isinstance(self.warmup_iters, int) + if not self.by_epoch: + self.regular_lr = self.get_regular_lr(runner) + if self.warmup is None or cur_iter >= self.warmup_iters: + self._set_lr(runner, self.regular_lr) + else: + if cur_iter % epoch_len == 0: + warmup_lr = self.get_warmup_lr(cur_iter) + self._set_lr(runner, warmup_lr) + elif self.by_epoch: + if self.warmup is None or cur_iter > self.warmup_iters: + return + elif cur_iter == self.warmup_iters: + self._set_lr(runner, self.regular_lr) + else: + if cur_iter % epoch_len == 0: + warmup_lr = self.get_warmup_lr(cur_iter) + self._set_lr(runner, warmup_lr) diff --git a/easycv/hooks/optimizer_hook.py b/easycv/hooks/optimizer_hook.py index 44a6d49b..e31be411 100644 --- a/easycv/hooks/optimizer_hook.py +++ b/easycv/hooks/optimizer_hook.py @@ -6,6 +6,7 @@ import torch from mmcv.parallel import is_module_wrapper from mmcv.runner import OptimizerHook as _OptimizerHook +from easycv.framework.errors import TypeError from easycv.utils.dist_utils import get_dist_info from easycv.utils.torchacc_util import is_torchacc_enabled @@ -134,7 +135,7 @@ class AMPFP16OptimizerHook(OptimizerHook): elif isinstance(loss_scale, dict): self.scaler = amp.GradScaler(**loss_scale) else: - raise ValueError( + raise TypeError( '`loss_scale` type must be in [float, dict], but got {loss_scale}' ) diff --git a/easycv/models/backbones/__init__.py b/easycv/models/backbones/__init__.py index c59a74fb..625f5282 100644 --- a/easycv/models/backbones/__init__.py +++ b/easycv/models/backbones/__init__.py @@ -21,4 +21,5 @@ from .resnet_jit import ResNetJIT from .resnext import ResNeXt from .shuffle_transformer import ShuffleTransformer from .swin_transformer import SwinTransformer +from .vision_transformer import VisionTransformer from .vitdet import ViTDet diff --git a/easycv/models/backbones/conv_mae_vit.py b/easycv/models/backbones/conv_mae_vit.py index 02755faf..ce9b7b61 100644 --- a/easycv/models/backbones/conv_mae_vit.py +++ b/easycv/models/backbones/conv_mae_vit.py @@ -10,7 +10,7 @@ from timm.models.layers import trunc_normal_ from easycv.models.registry import BACKBONES from easycv.models.utils import DropPath from easycv.models.utils.pos_embed import get_2d_sincos_pos_embed -from .vit_transfomer_dynamic import Block +from .vision_transformer import Block class PatchEmbed(nn.Module): diff --git a/easycv/models/backbones/hrnet.py b/easycv/models/backbones/hrnet.py index 90730d02..09cb2198 100644 --- a/easycv/models/backbones/hrnet.py +++ 
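# Hypothetical lr_config snippet for the new CosineAnnealingWarmupByEpochLrUpdaterHook
# registered above (values are illustrative; assumes mmcv's usual policy-name resolution,
# where policy='CosineAnnealingWarmupByEpoch' resolves to that hook class). During warmup
# the hook only refreshes the warmup lr when cur_iter % epoch_len == 0, i.e. once per
# epoch, while the cosine schedule itself behaves as in CosineAnnealingLrUpdaterHook.
lr_config = dict(
    policy='CosineAnnealingWarmupByEpoch',
    by_epoch=True,
    min_lr=1e-6,
    warmup='linear',
    warmup_iters=2500,   # counted in iterations (the hook asserts it is an int)
    warmup_ratio=1e-3)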
b/easycv/models/backbones/hrnet.py @@ -7,6 +7,7 @@ from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, normal_init) from torch.nn.modules.batchnorm import _BatchNorm +from easycv.framework.errors import NotImplementedError, TypeError, ValueError from easycv.models.registry import BACKBONES from ..modelzoo import hrnet as model_urls from .resnet import BasicBlock diff --git a/easycv/models/backbones/lighthrnet.py b/easycv/models/backbones/lighthrnet.py index 503f9d24..13846e0e 100644 --- a/easycv/models/backbones/lighthrnet.py +++ b/easycv/models/backbones/lighthrnet.py @@ -11,6 +11,7 @@ from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, normal_init) from torch.nn.modules.batchnorm import _BatchNorm +from easycv.framework.errors import ValueError from easycv.models.registry import BACKBONES diff --git a/easycv/models/backbones/mit.py b/easycv/models/backbones/mit.py index a9cf2c01..c957b737 100644 --- a/easycv/models/backbones/mit.py +++ b/easycv/models/backbones/mit.py @@ -13,6 +13,7 @@ from mmcv.cnn.utils.weight_init import (constant_init, normal_init, trunc_normal_init) from mmcv.runner import BaseModule, ModuleList, Sequential +from easycv.framework.errors import TypeError from easycv.models.registry import BACKBONES from easycv.models.segmentation.utils import (PatchEmbed, nchw_to_nlc, nlc_to_nchw) diff --git a/easycv/models/backbones/mobilenetv2.py b/easycv/models/backbones/mobilenetv2.py index 860b26d9..283cd55a 100644 --- a/easycv/models/backbones/mobilenetv2.py +++ b/easycv/models/backbones/mobilenetv2.py @@ -5,6 +5,7 @@ r""" This model is taken from the official PyTorch model zoo. from torch import nn +from easycv.framework.errors import ValueError from ..modelzoo import mobilenetv2 as model_urls from ..registry import BACKBONES diff --git a/easycv/models/backbones/pytorch_image_models_wrapper.py b/easycv/models/backbones/pytorch_image_models_wrapper.py index 6b141489..1072056d 100644 --- a/easycv/models/backbones/pytorch_image_models_wrapper.py +++ b/easycv/models/backbones/pytorch_image_models_wrapper.py @@ -7,6 +7,7 @@ import torch.nn as nn from timm.models.helpers import load_pretrained from timm.models.hub import download_cached_file +from easycv.framework.errors import ValueError from easycv.utils.logger import get_root_logger, print_log from ..modelzoo import timm_models as model_urls from ..registry import BACKBONES @@ -16,11 +17,11 @@ from .shuffle_transformer import (shuffletrans_base_p4_w7_224, from .swin_transformer_dynamic import (dynamic_swin_base_p4_w7_224, dynamic_swin_small_p4_w7_224, dynamic_swin_tiny_p4_w7_224) -from .vit_transfomer_dynamic import (dynamic_deit_small_p16, - dynamic_deit_tiny_p16, - dynamic_vit_base_p16, - dynamic_vit_huge_p14, - dynamic_vit_large_p16) +from .vit_transformer_dynamic import (dynamic_deit_small_p16, + dynamic_deit_tiny_p16, + dynamic_vit_base_p16, + dynamic_vit_huge_p14, + dynamic_vit_large_p16) from .xcit_transformer import (xcit_large_24_p8, xcit_medium_24_p8, xcit_medium_24_p16, xcit_small_12_p8, xcit_small_12_p16) @@ -36,7 +37,7 @@ _MODEL_MAP = { 'dynamic_swin_small_p4_w7_224': dynamic_swin_small_p4_w7_224, 'dynamic_swin_base_p4_w7_224': dynamic_swin_base_p4_w7_224, - # vit_transfomer_dynamic + # vit_transformer_dynamic 'dynamic_deit_small_p16': dynamic_deit_small_p16, 'dynamic_deit_tiny_p16': dynamic_deit_tiny_p16, 'dynamic_vit_base_p16': dynamic_vit_base_p16, diff --git a/easycv/models/backbones/resnest.py b/easycv/models/backbones/resnest.py index 13ef9987..6bd5a08e 100644 --- 
a/easycv/models/backbones/resnest.py +++ b/easycv/models/backbones/resnest.py @@ -14,6 +14,7 @@ import torch.nn.functional as F from torch.nn import Conv2d, Module, ReLU from torch.nn.modules.utils import _pair +from easycv.framework.errors import KeyError, NotImplementedError, RuntimeError from ..registry import BACKBONES diff --git a/easycv/models/backbones/resnet.py b/easycv/models/backbones/resnet.py index 6e083f86..ca5c959f 100644 --- a/easycv/models/backbones/resnet.py +++ b/easycv/models/backbones/resnet.py @@ -4,6 +4,7 @@ import torch.utils.checkpoint as cp from mmcv.cnn import constant_init, kaiming_init from torch.nn.modules.batchnorm import _BatchNorm +from easycv.framework.errors import KeyError from ..modelzoo import resnet as model_urls from ..registry import BACKBONES from ..utils import FReLU, build_conv_layer, build_norm_layer diff --git a/easycv/models/backbones/resnet_jit.py b/easycv/models/backbones/resnet_jit.py index 0d55e59a..4e241f86 100644 --- a/easycv/models/backbones/resnet_jit.py +++ b/easycv/models/backbones/resnet_jit.py @@ -6,6 +6,7 @@ import torch.nn as nn from mmcv.cnn import constant_init, kaiming_init from torch.nn.modules.batchnorm import _BatchNorm +from easycv.framework.errors import KeyError from ..registry import BACKBONES from ..utils import build_conv_layer, build_norm_layer diff --git a/easycv/models/backbones/shuffle_transformer.py b/easycv/models/backbones/shuffle_transformer.py index 965df17e..c53c103d 100644 --- a/easycv/models/backbones/shuffle_transformer.py +++ b/easycv/models/backbones/shuffle_transformer.py @@ -7,6 +7,7 @@ from einops import rearrange from timm.models.layers import DropPath, trunc_normal_ from torch import nn +from easycv.framework.errors import NotImplementedError from ..registry import BACKBONES diff --git a/easycv/models/backbones/vision_transformer.py b/easycv/models/backbones/vision_transformer.py new file mode 100644 index 00000000..79a9c900 --- /dev/null +++ b/easycv/models/backbones/vision_transformer.py @@ -0,0 +1,283 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" +Mostly copy-paste from timm library. 
+https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + +""" +from functools import partial + +import torch +import torch.nn as nn +from timm.models.layers import trunc_normal_ + +from easycv.models.utils import DropPath, Mlp +from ..registry import BACKBONES + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x, attn + + +class Block(nn.Module): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + use_layer_scale=False, + init_values=1e-4): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.use_layer_scale = use_layer_scale + if self.use_layer_scale: + self.gamma_1 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + + def forward(self, x, return_attention=False, rel_pos_bias=None): + y, attn = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias) + if return_attention: + return attn + if self.use_layer_scale: + x = x + self.drop_path(self.gamma_1 * y) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(y) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + def forward_fea_and_attn(self, x): + y, attn = self.attn(self.norm1(x)) + if self.use_layer_scale: + x = x + self.drop_path(self.gamma_1 * y) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(y) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x, attn + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + num_patches = (img_size // patch_size) * (img_size // patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +@BACKBONES.register_module +class VisionTransformer(nn.Module): + """ DeiT III is based on ViT. 
It uses some strategies to make the vit model + better, just like layer scale, stochastic depth, 3-Augment. + + Paper link: https://arxiv.org/pdf/2204.07118.pdf (DeiT III: Revenge of the ViT) + + Args: + img_size (list): Input image size. img_size=[224] means the image size is + 224*224. img_size=[192, 224] means the image size is 192*224. + patch_size (int): The patch size. Default: 16 + in_chans (int): The num of input channels. Default: 3 + num_classes (int): The num of picture classes. Default: 1000 + embed_dim (int): The dimensions of embedding. Default: 768 + depth (int): The num of blocks. Default: 12 + num_heads (int): Parallel attention heads. Default: 12 + mlp_ratio (float): Mlp expansion ratio. Default: 4.0 + qkv_bias (bool): Does kqv use bias. Default: False + qk_scale (float | None): In the step of self-attention, if qk_scale is not + None, it will use qk_scale to scale the q @ k. Otherwise it will use + head_dim**-0.5 instead of qk_scale. Default: None + drop_rate (float): Probability of an element to be zeroed after the feed + forward layer. Default: 0.0 + drop_path_rate (float): Stochastic depth rate. Default: 0 + norm_layer (nn.Module): normalization layer + global_pool (bool): Global pool before head. Default: False + use_layer_scale (bool): If use_layer_scale is True, it will use layer + scale. Default: False + init_scale (float): It is used for layer scale in Block to scale the + gamma_1 and gamma_2. + + """ + + def __init__(self, + img_size=[224], + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), + global_pool=False, + use_layer_scale=False, + init_scale=1e-4, + **kwargs): + super().__init__() + + self.num_features = self.embed_dim = embed_dim + self.num_heads = num_heads + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.norm_layer = norm_layer + self.use_layer_scale = use_layer_scale + self.init_scale = init_scale + + self.patch_embed = PatchEmbed( + img_size=img_size[0], + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + self.drop_path_rate = drop_path_rate + self.depth = depth + dpr = [drop_path_rate for i in range(depth)] + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + use_layer_scale=use_layer_scale, + init_values=init_scale) for i in range(depth) + ]) + self.norm = norm_layer(embed_dim) + + # Classifier head + self.head = nn.Linear( + embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + # Use global average pooling + self.global_pool = global_pool + if self.global_pool: + self.fc_norm = norm_layer(embed_dim) + self.norm = None + + def init_weights(self): + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif 
isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x): + + x = self.forward_features(x) + x = self.pos_drop(x) + x = self.head(x) + + return [x] + + def forward_features(self, x): + B = x.shape[0] + x = self.patch_embed(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) + + x = x + self.pos_embed + x = torch.cat((cls_tokens, x), dim=1) + + for blk in self.blocks: + x = blk(x) + if self.norm is not None: + x = self.norm(x) + + if self.global_pool: + x = x[:, 1:, :].mean(dim=1) + return self.fc_norm(x) + else: + return x[:, 0] diff --git a/easycv/models/backbones/vit_transfomer_dynamic.py b/easycv/models/backbones/vit_transformer_dynamic.py similarity index 62% rename from easycv/models/backbones/vit_transfomer_dynamic.py rename to easycv/models/backbones/vit_transformer_dynamic.py index 3ff94701..3f8d0968 100644 --- a/easycv/models/backbones/vit_transfomer_dynamic.py +++ b/easycv/models/backbones/vit_transformer_dynamic.py @@ -12,198 +12,51 @@ from functools import partial import torch import torch.nn as nn -from timm.models.layers import trunc_normal_ -from easycv.models.utils import DropPath, Mlp +from easycv.models.backbones.vision_transformer import Block, VisionTransformer -class Attention(nn.Module): +class DynamicVisionTransformer(VisionTransformer): + """Dynamic Vision Transformer - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0., - proj_drop=0.): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 + Args: + use_dense_prediction (bool): If use_dense_prediction is True, the global + pool and norm will before head will be removed.(if any) Default: False - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x, rel_pos_bias=None): - B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] - - attn = (q @ k.transpose(-2, -1)) * self.scale - - if rel_pos_bias is not None: - attn = attn + rel_pos_bias - - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x, attn - - -class Block(nn.Module): - - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm): - super().__init__() - self.norm1 = norm_layer(dim) - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop) - self.drop_path = DropPath( - drop_path) if drop_path > 0. 
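# Instantiation sketch for the new DeiT III-style VisionTransformer backbone defined
# above (assumes this patch is applied and torch is available; the hyper-parameters are
# illustrative). use_layer_scale=True enables the gamma_1/gamma_2 parameters in each
# Block, and forward() returns a single-element list of logits.
import torch
from easycv.models.backbones.vision_transformer import VisionTransformer

backbone = VisionTransformer(
    img_size=[192],          # list form: a 192x192 input
    patch_size=16,
    num_classes=1000,
    embed_dim=768,
    depth=12,
    num_heads=12,
    mlp_ratio=4,
    qkv_bias=True,
    drop_path_rate=0.2,
    use_layer_scale=True)
backbone.init_weights()

with torch.no_grad():
    out = backbone(torch.randn(2, 3, 192, 192))
print(out[0].shape)  # torch.Size([2, 1000])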
else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) - - def forward(self, x, return_attention=False, rel_pos_bias=None): - y, attn = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias) - if return_attention: - return attn - x = x + self.drop_path(y) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - def forward_fea_and_attn(self, x): - y, attn = self.attn(self.norm1(x)) - x = x + self.drop_path(y) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x, attn - - -class PatchEmbed(nn.Module): - """ Image to Patch Embedding """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - num_patches = (img_size // patch_size) * (img_size // patch_size) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches + def __init__(self, use_dense_prediction=False, **kwargs): + super(DynamicVisionTransformer, self).__init__(**kwargs) - self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x): - B, C, H, W = x.shape - x = self.proj(x).flatten(2).transpose(1, 2) - return x - - -class DynamicVisionTransformer(nn.Module): - """Dynamic Vision Transformer """ - - def __init__(self, - img_size=[224], - patch_size=16, - in_chans=3, - num_classes=0, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - norm_layer=nn.LayerNorm, - use_dense_prediction=False, - global_pool=False, - **kwargs): - super().__init__() - self.num_features = self.embed_dim = embed_dim - - self.patch_embed = PatchEmbed( - img_size=img_size[0], - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim) num_patches = self.patch_embed.num_patches - self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) self.pos_embed = nn.Parameter( - torch.zeros(1, num_patches + 1, embed_dim)) - self.pos_drop = nn.Dropout(p=drop_rate) + torch.zeros(1, num_patches + 1, self.embed_dim)) - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) - ] # stochastic depth decay rule + dpr = [ + x.item() + for x in torch.linspace(0, self.drop_path_rate, self.depth) + ] self.blocks = nn.ModuleList([ Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, + dim=self.embed_dim, + num_heads=self.num_heads, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, + drop=self.drop_rate, + attn_drop=self.attn_drop_rate, drop_path=dpr[i], - norm_layer=norm_layer) for i in range(depth) + norm_layer=self.norm_layer, + use_layer_scale=self.use_layer_scale, + init_values=self.init_scale) for i in range(self.depth) ]) - self.norm = norm_layer(embed_dim) - - # Classifier head - self.head = nn.Linear( - embed_dim, num_classes) if num_classes > 0 else nn.Identity() # Dense prediction head self.use_dense_prediction = use_dense_prediction if self.use_dense_prediction: self.head_dense = None - -# Use global average pooling - self.global_pool = global_pool - if self.global_pool: - self.fc_norm = norm_layer(embed_dim) - self.norm = None - - trunc_normal_(self.pos_embed, std=.02) - trunc_normal_(self.cls_token, std=.02) - - def init_weights(self): - for m in self.modules(): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if 
isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - def forward(self, x): # convert to list if not isinstance(x, list): diff --git a/easycv/models/backbones/vitdet.py b/easycv/models/backbones/vitdet.py index 83e11efa..9380f740 100644 --- a/easycv/models/backbones/vitdet.py +++ b/easycv/models/backbones/vitdet.py @@ -1,5 +1,3 @@ -# Copyright 2018-2023 OpenMMLab. All rights reserved. -# Reference: https://github.com/ViTAE-Transformer/ViTDet/blob/main/mmdet/models/backbones/vit.py import math from functools import partial @@ -7,793 +5,466 @@ import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint -from mmcv.cnn import build_norm_layer, constant_init, kaiming_init -from mmcv.runner import get_dist_info -from timm.models.layers import to_2tuple, trunc_normal_ -from torch.nn.modules.batchnorm import _BatchNorm +from timm.models.layers import DropPath, trunc_normal_ -from easycv.models.utils import DropPath, Mlp +from easycv.models.utils import Mlp from easycv.utils.checkpoint import load_checkpoint from easycv.utils.logger import get_root_logger from ..registry import BACKBONES -from ..utils import build_conv_layer - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - conv_cfg=None, - norm_cfg=dict(type='BN')): - super(BasicBlock, self).__init__() - - self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) - self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) - - self.conv1 = build_conv_layer( - conv_cfg, - inplanes, - planes, - 3, - stride=stride, - padding=dilation, - dilation=dilation, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - conv_cfg, planes, planes, 3, padding=1, bias=False) - self.add_module(self.norm2_name, norm2) - - self.relu = nn.ReLU(inplace=True) - self.stride = stride - self.dilation = dilation - - @property - def norm1(self): - return getattr(self, self.norm1_name) - - @property - def norm2(self): - return getattr(self, self.norm2_name) - - def forward(self, x, H, W): - B, _, C = x.shape - x = x.permute(0, 2, 1).reshape(B, -1, H, W) - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.norm2(out) - - out += identity - out = self.relu(out) - out = out.flatten(2).transpose(1, 2) - return out - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - conv_cfg=None, - norm_cfg=dict(type='BN')): - """Bottleneck block for ResNet. - If style is "pytorch", the stride-two layer is the 3x3 conv layer, - if it is "caffe", the stride-two layer is the first 1x1 conv layer. 
- """ - super(Bottleneck, self).__init__() - - self.inplanes = inplanes - self.planes = planes - self.stride = stride - self.dilation = dilation - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - - self.conv1_stride = 1 - self.conv2_stride = stride - - self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) - self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - norm_cfg, planes * self.expansion, postfix=3) - - self.conv1 = build_conv_layer( - conv_cfg, - inplanes, - planes, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - conv_cfg, - planes, - planes, - kernel_size=3, - stride=self.conv2_stride, - padding=dilation, - dilation=dilation, - bias=False) - self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer( - conv_cfg, - planes, - planes * self.expansion, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - self.relu = nn.ReLU(inplace=True) - - @property - def norm1(self): - return getattr(self, self.norm1_name) - - @property - def norm2(self): - return getattr(self, self.norm2_name) - - @property - def norm3(self): - return getattr(self, self.norm3_name) - - def forward(self, x, H, W): - B, _, C = x.shape - x = x.permute(0, 2, 1).reshape(B, -1, H, W) - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.norm2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.norm3(out) - - out += identity - out = self.relu(out) - out = out.flatten(2).transpose(1, 2) - return out - - -class Attention(nn.Module): - - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0., - proj_drop=0., - window_size=None, - attn_head_dim=None): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - if attn_head_dim is not None: - head_dim = attn_head_dim - all_head_dim = head_dim * self.num_heads - # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights - self.scale = qk_scale or head_dim**-0.5 - - self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias) - self.window_size = window_size - q_size = window_size[0] - kv_size = q_size - rel_sp_dim = 2 * q_size - 1 - self.rel_pos_h = nn.Parameter(torch.zeros(rel_sp_dim, head_dim)) - self.rel_pos_w = nn.Parameter(torch.zeros(rel_sp_dim, head_dim)) - - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(all_head_dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x, H, W, rel_pos_bias=None): - B, N, C = x.shape - # qkv_bias = None - # if self.q_bias is not None: - # qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) - # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - qkv = self.qkv(x) - qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[ - 2] # make torchscript happy (cannot use tensor as tuple) - - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) - attn = calc_rel_pos_spatial(attn, q, self.window_size, - self.window_size, self.rel_pos_h, - self.rel_pos_w) - # if self.relative_position_bias_table is not None: - # relative_position_bias = \ - # self.relative_position_bias_table[self.relative_position_index.view(-1)].view( - # self.window_size[0] * self.window_size[1] + 1, - # 
self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH - # relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - # attn = attn + relative_position_bias.unsqueeze(0) - - # if rel_pos_bias is not None: - # attn = attn + rel_pos_bias - - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, -1) - x = self.proj(x) - x = self.proj_drop(x) - return x def window_partition(x, window_size): """ + Partition into non-overlapping windows with padding if needed. Args: - x: (B, H, W, C) - window_size (int): window size + x (tensor): input tokens with [B, H, W, C]. + window_size (int): window size. Returns: - windows: (num_windows*B, window_size, window_size, C) + windows: windows after partition with [B * num_windows, window_size, window_size, C]. + (Hp, Wp): padded height and width before partition """ B, H, W, C = x.shape - x = x.view(B, H // window_size, window_size, W // window_size, window_size, - C) + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) + Hp, Wp = H + pad_h, W + pad_w + + x = x.view(B, Hp // window_size, window_size, Wp // window_size, + window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) - return windows + return windows, (Hp, Wp) -def window_reverse(windows, window_size, H, W): +def window_unpartition(windows, window_size, pad_hw, hw): """ + Window unpartition into original sequences and removing padding. Args: - windows: (num_windows*B, window_size, window_size, C) - window_size (int): Window size - H (int): Height of image - W (int): Width of image + x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + window_size (int): window size. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. Returns: - x: (B, H, W, C) + x: unpartitioned sequences with [B, H, W, C]. """ - B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.view(B, H // window_size, W // window_size, window_size, + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1) - x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) + + if Hp > H or Wp > W: + x = x[:, :H, :W, :].contiguous() return x -def calc_rel_pos_spatial( - attn, - q, - q_shape, - k_shape, - rel_pos_h, - rel_pos_w, -): +def get_rel_pos(q_size, k_size, rel_pos): """ - Spatial Relative Positional Embeddings. + Get relative positional embeddings according to the relative positions of + query and key sizes. + Args: + q_size (int): size of query q. + k_size (int): size of key k. + rel_pos (Tensor): relative position embeddings (L, C). + Returns: + Extracted positional embeddings according to relative positions. """ - sp_idx = 0 - q_h, q_w = q_shape - k_h, k_w = k_shape + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. 
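# Round-trip sketch for the padding-aware window helpers above (assumes this patch is
# applied so window_partition / window_unpartition are importable from the rewritten
# vitdet module). H and W need not be multiples of the window size: the input is padded
# before partitioning and the padding is cropped again on unpartition.
import torch
from easycv.models.backbones.vitdet import window_partition, window_unpartition

x = torch.randn(2, 37, 45, 64)                # (B, H, W, C), H and W not multiples of 14
windows, (Hp, Wp) = window_partition(x, 14)   # pads to Hp=42, Wp=56 before splitting
print(windows.shape)                          # torch.Size([24, 14, 14, 64]) = 2*3*4 windows
y = window_unpartition(windows, 14, (Hp, Wp), (37, 45))
print(torch.equal(x, y))                      # True: the padded region is cropped away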
+ rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode='linear', + ) + rel_pos_resized = rel_pos_resized.reshape(-1, + max_rel_dist).permute(1, 0) + else: + rel_pos_resized = rel_pos - # Scale up rel pos if shapes for q and k are different. - q_h_ratio = max(k_h / q_h, 1.0) - k_h_ratio = max(q_h / k_h, 1.0) - dist_h = ( - torch.arange(q_h)[:, None] * q_h_ratio - - torch.arange(k_h)[None, :] * k_h_ratio) - dist_h += (k_h - 1) * k_h_ratio - q_w_ratio = max(k_w / q_w, 1.0) - k_w_ratio = max(q_w / k_w, 1.0) - dist_w = ( - torch.arange(q_w)[:, None] * q_w_ratio - - torch.arange(k_w)[None, :] * k_w_ratio) - dist_w += (k_w - 1) * k_w_ratio + # Scale the coords with short length if shapes for q and k are different. + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - + k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) - Rh = rel_pos_h[dist_h.long()] - Rw = rel_pos_w[dist_w.long()] + return rel_pos_resized[relative_coords.long()] - B, n_head, q_N, dim = q.shape - r_q = q[:, :, sp_idx:].reshape(B, n_head, q_h, q_w, dim) - rel_h = torch.einsum('byhwc,hkc->byhwk', r_q, Rh) - rel_w = torch.einsum('byhwc,wkc->byhwk', r_q, Rw) +def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size): + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 + Args: + attn (Tensor): attention map. + q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). + rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. + rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. + q_size (Tuple): spatial sequence size of query q with (q_h, q_w). + k_size (Tuple): spatial sequence size of key k with (k_h, k_w). + Returns: + attn (Tensor): attention map with added relative positional embeddings. + """ + q_h, q_w = q_size + k_h, k_w = k_size + Rh = get_rel_pos(q_h, k_h, rel_pos_h) + Rw = get_rel_pos(q_w, k_w, rel_pos_w) - attn[:, :, sp_idx:, sp_idx:] = ( - attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_h, q_w, k_h, k_w) + - rel_h[:, :, :, :, :, None] + rel_w[:, :, :, :, None, :]).view( - B, -1, q_h * q_w, k_h * k_w) + B, _, dim = q.shape + r_q = q.reshape(B, q_h, q_w, dim) + rel_h = torch.einsum('bhwc,hkc->bhwk', r_q, Rh) + rel_w = torch.einsum('bhwc,wkc->bhwk', r_q, Rw) + + attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + + rel_w[:, :, :, None, :]).view(B, q_h * q_w, k_h * k_w) return attn -class WindowAttention(nn.Module): - """ Window based multi-head self attention (W-MSA) module with relative position bias. - It supports both of shifted and non-shifted window. +def get_abs_pos(abs_pos, has_cls_token, hw): + """ + Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token + dimension for the original embeddings. Args: - dim (int): Number of input channels. - window_size (tuple[int]): The height and width of the window. - num_heads (int): Number of attention heads. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set - attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 - proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). + has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. + hw (Tuple): size of input image tokens. + Returns: + Absolute positional embeddings after processing with shape (1, H, W, C) + """ + h, w = hw + if has_cls_token: + abs_pos = abs_pos[:, 1:] + xy_num = abs_pos.shape[1] + size = int(math.sqrt(xy_num)) + assert size * size == xy_num + + if size != h or size != w: + new_abs_pos = F.interpolate( + abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2), + size=(h, w), + mode='bicubic', + align_corners=False, + ) + + return new_abs_pos.permute(0, 2, 3, 1) + else: + return abs_pos.reshape(1, h, w, -1) + + +class PatchEmbed(nn.Module): + """ + Image to Patch Embedding. """ def __init__(self, - dim, - window_size, - num_heads, - qkv_bias=True, - qk_scale=None, - attn_drop=0., - proj_drop=0., - attn_head_dim=None): - + kernel_size=(16, 16), + stride=(16, 16), + padding=(0, 0), + in_chans=3, + embed_dim=768): + """ + Args: + kernel_size (Tuple): kernel size of the projection layer. + stride (Tuple): stride of the projection layer. + padding (Tuple): padding size of the projection layer. + in_chans (int): Number of input image channels. + embed_dim (int): embed_dim (int): Patch embedding dimension. + """ + super().__init__() + + self.proj = nn.Conv2d( + in_chans, + embed_dim, + kernel_size=kernel_size, + stride=stride, + padding=padding) + + def forward(self, x): + x = self.proj(x) + # B C H W -> B H W C + x = x.permute(0, 2, 3, 1) + return x + + +class Attention(nn.Module): + """Multi-head Attention block with relative position embeddings.""" + + def __init__( + self, + dim, + num_heads=8, + qkv_bias=True, + use_rel_pos=False, + rel_pos_zero_init=True, + input_size=None, + ): + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + qkv_bias (bool: If True, add a learnable bias to query, key, value. + rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + input_size (int or None): Input resolution for calculating the relative positional + parameter size. + """ super().__init__() - self.dim = dim - self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - - q_size = window_size[0] - kv_size = window_size[1] - rel_sp_dim = 2 * q_size - 1 - self.rel_pos_h = nn.Parameter(torch.zeros(rel_sp_dim, head_dim)) - self.rel_pos_w = nn.Parameter(torch.zeros(rel_sp_dim, head_dim)) + self.scale = head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - # trunc_normal_(self.relative_position_bias_table, std=.02) - self.softmax = nn.Softmax(dim=-1) + self.use_rel_pos = use_rel_pos + if self.use_rel_pos: + # initialize relative positional embeddings + self.rel_pos_h = nn.Parameter( + torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter( + torch.zeros(2 * input_size[1] - 1, head_dim)) - def forward(self, x, H, W): - """ Forward function. 
- Args: - x: input features with shape of (num_windows*B, N, C) - mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None - """ - B_, N, C = x.shape - x = x.reshape(B_, H, W, C) - pad_l = pad_t = 0 - pad_r = (self.window_size[1] - - W % self.window_size[1]) % self.window_size[1] - pad_b = (self.window_size[0] - - H % self.window_size[0]) % self.window_size[0] + if not rel_pos_zero_init: + trunc_normal_(self.rel_pos_h, std=0.02) + trunc_normal_(self.rel_pos_w, std=0.02) - x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) - _, Hp, Wp, _ = x.shape + def forward(self, x): + B, H, W, _ = x.shape + # qkv with shape (3, B, nHead, H * W, C) + qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, + -1).permute(2, 0, 3, 1, 4) + # q, k, v with shape (B * nHead, H * W, C) + q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) - x = window_partition( - x, self.window_size[0]) # nW*B, window_size, window_size, C - x = x.view(-1, self.window_size[1] * self.window_size[0], - C) # nW*B, window_size*window_size, C - B_w = x.shape[0] - N_w = x.shape[1] - qkv = self.qkv(x).reshape(B_w, N_w, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[ - 2] # make torchscript happy (cannot use tensor as tuple) + attn = (q * self.scale) @ k.transpose(-2, -1) - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) + if self.use_rel_pos: + attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, + self.rel_pos_w, (H, W), (H, W)) - attn = calc_rel_pos_spatial(attn, q, self.window_size, - self.window_size, self.rel_pos_h, - self.rel_pos_w) - - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B_w, N_w, C) + attn = attn.softmax(dim=-1) + x = (attn @ v).view(B, self.num_heads, H, W, + -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) x = self.proj(x) - x = self.proj_drop(x) - - x = x.view(-1, self.window_size[1], self.window_size[0], C) - x = window_reverse(x, self.window_size[0], Hp, Wp) # B H' W' C - - if pad_r > 0 or pad_b > 0: - x = x[:, :H, :W, :].contiguous() - - x = x.view(B_, H * W, C) return x class Block(nn.Module): + """Transformer blocks with support of window attention and residual propagation blocks""" - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - init_values=None, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - window_size=None, - attn_head_dim=None, - window=False, - aggregation='attn'): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=True, + drop_path=0.0, + norm_layer=nn.LayerNorm, + act_layer=nn.GELU, + use_rel_pos=False, + rel_pos_zero_init=True, + window_size=0, + use_residual_block=False, + input_size=None, + ): + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + drop_path (float): Stochastic depth rate. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. If it equals 0, then not + use window attention. + use_residual_block (bool): If True, use a residual block after the MLP block. 
+ input_size (int or None): Input resolution for calculating the relative positional + parameter size. + """ super().__init__() self.norm1 = norm_layer(dim) - self.aggregation = aggregation - self.window = window - if not window: - if aggregation == 'attn': - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - window_size=window_size, - attn_head_dim=attn_head_dim) - else: - self.attn = WindowAttention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - window_size=window_size, - attn_head_dim=attn_head_dim) - if aggregation == 'basicblock': - self.conv_aggregation = BasicBlock( - inplanes=dim, planes=dim) - elif aggregation == 'bottleneck': - self.conv_aggregation = Bottleneck( - inplanes=dim, planes=dim // 4) - else: - self.attn = WindowAttention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - window_size=window_size, - attn_head_dim=attn_head_dim) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + input_size=input_size if window_size == 0 else + (window_size, window_size), + ) + self.drop_path = DropPath( - drop_path) if drop_path > 0. else nn.Identity() + drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) + hidden_features=int(dim * mlp_ratio), + act_layer=act_layer) - if init_values is not None: - self.gamma_1 = nn.Parameter( - init_values * torch.ones((dim)), requires_grad=True) - self.gamma_2 = nn.Parameter( - init_values * torch.ones((dim)), requires_grad=True) - else: - self.gamma_1, self.gamma_2 = None, None + self.window_size = window_size - def forward(self, x, H, W): - if self.gamma_1 is None: - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - else: - x = x + self.drop_path( - self.gamma_1 * self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - if not self.window and self.aggregation != 'attn': - x = self.conv_aggregation(x, H, W) - return x - - -class PatchEmbed(nn.Module): - """ Image to Patch Embedding - """ - - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * ( - img_size[0] // patch_size[0]) - self.patch_shape = (img_size[0] // patch_size[0], - img_size[1] // patch_size[1]) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x, **kwargs): - B, C, H, W = x.shape - # FIXME look at relaxing size constraints - # assert H == self.img_size[0] and W == self.img_size[1], \ - # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
- x = self.proj(x) - Hp, Wp = x.shape[2], x.shape[3] - - x = x.flatten(2).transpose(1, 2) - return x, (Hp, Wp) - - -class HybridEmbed(nn.Module): - """ CNN Feature Map Embedding - Extract feature map from CNN, flatten, project to embedding dim. - """ - - def __init__(self, - backbone, - img_size=224, - feature_size=None, - in_chans=3, - embed_dim=768): - super().__init__() - assert isinstance(backbone, nn.Module) - img_size = to_2tuple(img_size) - self.img_size = img_size - self.backbone = backbone - if feature_size is None: - with torch.no_grad(): - # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature - # map for all networks, the feature metadata has reliable channel and stride info, but using - # stride to calc feature dim requires info about padding of each stage that isn't captured. - training = backbone.training - if training: - backbone.eval() - o = self.backbone( - torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] - feature_size = o.shape[-2:] - feature_dim = o.shape[1] - backbone.train(training) - else: - feature_size = to_2tuple(feature_size) - feature_dim = self.backbone.feature_info.channels()[-1] - self.num_patches = feature_size[0] * feature_size[1] - self.proj = nn.Linear(feature_dim, embed_dim) + self.use_residual_block = use_residual_block def forward(self, x): - x = self.backbone(x)[-1] - x = x.flatten(2).transpose(1, 2) - x = self.proj(x) + shortcut = x + x = self.norm1(x) + # Window partition + if self.window_size > 0: + H, W = x.shape[1], x.shape[2] + x, pad_hw = window_partition(x, self.window_size) + + x = self.attn(x) + # Reverse window partition + if self.window_size > 0: + x = window_unpartition(x, self.window_size, pad_hw, (H, W)) + + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + if self.use_residual_block: + x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + return x -class Norm2d(nn.Module): - - def __init__(self, embed_dim): - super().__init__() - self.ln = nn.LayerNorm(embed_dim, eps=1e-6) - - def forward(self, x): - x = x.permute(0, 2, 3, 1) - x = self.ln(x) - x = x.permute(0, 3, 1, 2).contiguous() - return x - - -# todo: refactor vitdet and vit_transformer_dynamic @BACKBONES.register_module() class ViTDet(nn.Module): - """ Vision Transformer with support for patch or hybrid CNN input stage + """ + This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`. 
+ "Exploring Plain Vision Transformer Backbones for Object Detection", + https://arxiv.org/abs/2203.16527 """ - def __init__(self, - img_size=224, - patch_size=16, - in_chans=3, - num_classes=80, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - hybrid_backbone=None, - norm_layer=None, - init_values=None, - use_checkpoint=False, - use_abs_pos_emb=False, - use_rel_pos_bias=False, - use_shared_rel_pos_bias=False, - out_indices=[11], - interval=3, - pretrained=None, - aggregation='attn'): + def __init__( + self, + img_size=1024, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + drop_path_rate=0.0, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + act_layer=nn.GELU, + use_abs_pos=True, + use_rel_pos=False, + rel_pos_zero_init=True, + window_size=0, + window_block_indexes=(), + residual_block_indexes=(), + use_act_checkpoint=False, + pretrain_img_size=224, + pretrain_use_cls_token=True, + pretrained=None, + ): + """ + Args: + img_size (int): Input image size. + patch_size (int): Patch size. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + depth (int): Depth of ViT. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + drop_path_rate (float): Stochastic depth rate. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_abs_pos (bool): If True, use absolute positional embeddings. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. + window_block_indexes (list): Indexes for blocks using window attention. + residual_block_indexes (list): Indexes for blocks using conv propagation. + use_act_checkpoint (bool): If True, use activation checkpointing. + pretrain_img_size (int): input image size for pretraining models. + pretrain_use_cls_token (bool): If True, pretrainig models use class token. + """ super().__init__() - norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) - self.num_classes = num_classes - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.pretrain_use_cls_token = pretrain_use_cls_token + self.use_act_checkpoint = use_act_checkpoint - if hybrid_backbone is not None: - self.patch_embed = HybridEmbed( - hybrid_backbone, - img_size=img_size, - in_chans=in_chans, - embed_dim=embed_dim) - else: - self.patch_embed = PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim) + self.patch_embed = PatchEmbed( + kernel_size=(patch_size, patch_size), + stride=(patch_size, patch_size), + in_chans=in_chans, + embed_dim=embed_dim, + ) - num_patches = self.patch_embed.num_patches - - self.out_indices = out_indices - - if use_abs_pos_emb: + if use_abs_pos: + # Initialize absolute positional embedding with pretrain image size. 
+ num_patches = (pretrain_img_size // patch_size) * ( + pretrain_img_size // patch_size) + num_positions = (num_patches + + 1) if pretrain_use_cls_token else num_patches self.pos_embed = nn.Parameter( - torch.zeros(1, num_patches, embed_dim)) + torch.zeros(1, num_positions, embed_dim)) else: self.pos_embed = None - self.pos_drop = nn.Dropout(p=drop_rate) + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) - ] # stochastic depth decay rule - self.use_rel_pos_bias = use_rel_pos_bias - self.use_checkpoint = use_checkpoint - self.blocks = nn.ModuleList([ - Block( + self.blocks = nn.ModuleList() + for i in range(depth): + block = Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, - init_values=init_values, - window_size=(14, 14) if - ((i + 1) % interval != 0 - or aggregation != 'attn') else self.patch_embed.patch_shape, - window=((i + 1) % interval != 0), - aggregation=aggregation) for i in range(depth) - ]) + act_layer=act_layer, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=window_size if i in window_block_indexes else 0, + use_residual_block=i in residual_block_indexes, + input_size=(img_size // patch_size, img_size // patch_size), + ) + self.blocks.append(block) if self.pos_embed is not None: - trunc_normal_(self.pos_embed, std=.02) - - self.norm = norm_layer(embed_dim) + trunc_normal_(self.pos_embed, std=0.02) + self.apply(self._init_weights) self.pretrained = pretrained - self._register_load_state_dict_pre_hook(self._prepare_checkpoint_hook) - def fix_init_weight(self): - - def rescale(param, layer_id): - param.div_(math.sqrt(2.0 * layer_id)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight.data, layer_id + 1) - rescale(layer.mlp.fc2.weight.data, layer_id + 1) - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - self.fix_init_weight() - pretrained = pretrained or self.pretrained - - def _init_weights(m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) - if isinstance(m, nn.Conv2d): - kaiming_init(m, mode='fan_in', nonlinearity='relu') - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - - if isinstance(m, Bottleneck): - constant_init(m.norm3, 0) - elif isinstance(m, BasicBlock): - constant_init(m.norm2, 0) - - if isinstance(pretrained, str): - self.apply(_init_weights) + def init_weights(self): + if isinstance(self.pretrained, str): logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - self.apply(_init_weights) - else: - raise TypeError('pretrained must be a str or None') - - def _prepare_checkpoint_hook(self, state_dict, prefix, *args, **kwargs): - rank, _ = get_dist_info() - if 'pos_embed' in state_dict: - pos_embed_checkpoint = state_dict['pos_embed'] - embedding_size = pos_embed_checkpoint.shape[-1] - H, W = self.patch_embed.patch_shape - num_patches = self.patch_embed.num_patches - num_extra_tokens = 1 - # height (== width) for the checkpoint position embedding - orig_size = int( - (pos_embed_checkpoint.shape[-2] - num_extra_tokens)**0.5) - # height (== width) for the new position embedding - new_size = int(num_patches**0.5) - # class_token and dist_token are kept unchanged - if orig_size != new_size: - if rank == 0: - print('Position interpolate from %dx%d to %dx%d' % - (orig_size, orig_size, H, W)) - # extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] - # only the position tokens are interpolated - pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, - embedding_size).permute( - 0, 3, 1, 2) - pos_tokens = torch.nn.functional.interpolate( - pos_tokens, - size=(H, W), - mode='bicubic', - align_corners=False) - new_pos_embed = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) - # new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) - state_dict['pos_embed'] = new_pos_embed - - def get_num_layers(self): - return len(self.blocks) - - @torch.jit.ignore - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - def forward_features(self, x): - B, C, H, W = x.shape - x, (Hp, Wp) = self.patch_embed(x) - batch_size, seq_len, _ = x.size() - - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - outs = [] - for i, blk in enumerate(self.blocks): - if self.use_checkpoint: - x = checkpoint.checkpoint(blk, x) - else: - x = blk(x, Hp, Wp) - - x = self.norm(x) - xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp) - - outs.append(xp) - - return tuple(outs) + load_checkpoint(self, self.pretrained, strict=False, logger=logger) def forward(self, x): - x = self.forward_features(x) - return x + x = self.patch_embed(x) + if self.pos_embed is not None: + x = x + get_abs_pos(self.pos_embed, self.pretrain_use_cls_token, + (x.shape[1], x.shape[2])) + + for blk in self.blocks: + if self.use_act_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + + 
outputs = [x.permute(0, 3, 1, 2)] + return outputs diff --git a/easycv/models/backbones/xcit_transformer.py b/easycv/models/backbones/xcit_transformer.py index 0ee9cf87..18722f1e 100644 --- a/easycv/models/backbones/xcit_transformer.py +++ b/easycv/models/backbones/xcit_transformer.py @@ -19,6 +19,7 @@ import torch.nn as nn from timm.models.layers import DropPath, to_2tuple, trunc_normal_ from timm.models.vision_transformer import Mlp, _cfg +from easycv.framework.errors import ValueError from ..registry import BACKBONES @@ -109,7 +110,7 @@ class ConvPatchEmbed(nn.Module): conv3x3(embed_dim // 2, embed_dim, 2), ) else: - raise ( + raise ValueError( 'For convolutional projection, patch size has to be in [8, 16]' ) diff --git a/easycv/models/base.py b/easycv/models/base.py index 0385baf7..1f4c3278 100644 --- a/easycv/models/base.py +++ b/easycv/models/base.py @@ -8,6 +8,8 @@ import torch.distributed as dist import torch.nn as nn from torch import Tensor +from easycv.framework.errors import NotImplementedError, TypeError + class BaseModel(nn.Module, metaclass=ABCMeta): ''' base class for model. ''' diff --git a/easycv/models/classification/classification.py b/easycv/models/classification/classification.py index ddcc9e31..ccd30d50 100644 --- a/easycv/models/classification/classification.py +++ b/easycv/models/classification/classification.py @@ -7,6 +7,7 @@ import torch.nn as nn from mmcv.runner import get_dist_info from timm.data.mixup import Mixup +from easycv.framework.errors import KeyError, NotImplementedError, ValueError from easycv.utils.checkpoint import load_checkpoint from easycv.utils.logger import get_root_logger, print_log from easycv.utils.preprocess_function import (bninceptionPre, gaussianBlur, @@ -53,22 +54,15 @@ class Classification(BaseModel): if 'mixUp' in train_preprocess: rank, _ = get_dist_info() np.random.seed(rank + 12) - if not mixup_cfg: - num_classes = head.get( - 'num_classes', - 1000) if 'num_classes' in head else backbone.get( - 'num_classes', 1000) - mixup_cfg = dict( - mixup_alpha=0.8, - cutmix_alpha=1.0, - cutmix_minmax=None, - prob=1.0, - switch_prob=0.5, - mode='batch', - label_smoothing=0.1, - num_classes=num_classes) - self.mixup = Mixup(**mixup_cfg) - head.loss_config = {'type': 'SoftTargetCrossEntropy'} + if mixup_cfg is not None: + if 'num_classes' in mixup_cfg: + self.mixup = Mixup(**mixup_cfg) + elif 'num_classes' in head or 'num_classes' in backbone: + num_classes = head.get( + 'num_classes' + ) if 'num_classes' in head else backbone.get('num_classes') + mixup_cfg['num_classes'] = num_classes + self.mixup = Mixup(**mixup_cfg) train_preprocess.remove('mixUp') self.train_preprocess = [ self.preprocess_key_map[i] for i in train_preprocess @@ -173,7 +167,10 @@ class Classification(BaseModel): for preprocess in self.train_preprocess: img = preprocess(img) - if hasattr(self, 'mixup'): + # When the number of samples in the dataset is odd, the last batch size of each epoch will be odd, + # which will cause mixup to report an error. To avoid this situation, mixup is applied only when + # the batch size is even. 
+ if hasattr(self, 'mixup') and len(img) % 2 == 0: img, gt_labels = self.mixup(img, gt_labels) x = self.forward_backbone(img) @@ -304,4 +301,4 @@ class Classification(BaseModel): rv['gt_labels'] = gt_labels.cpu() return rv else: - raise Exception('No such mode: {}'.format(mode)) + raise KeyError('No such mode: {}'.format(mode)) diff --git a/easycv/models/detection/detectors/dab_detr/attention.py b/easycv/models/detection/detectors/dab_detr/attention.py index ce4b6929..95b952cd 100644 --- a/easycv/models/detection/detectors/dab_detr/attention.py +++ b/easycv/models/detection/detectors/dab_detr/attention.py @@ -29,6 +29,8 @@ from torch.nn.init import constant_ from torch.nn.modules.linear import Linear from torch.nn.modules.module import Module +from easycv.framework.errors import RuntimeError + try: from torch.overrides import has_torch_function, handle_torch_function except: diff --git a/easycv/models/detection/detectors/dab_detr/dab_detr_transformer.py b/easycv/models/detection/detectors/dab_detr/dab_detr_transformer.py index 06345b11..1802f8e9 100644 --- a/easycv/models/detection/detectors/dab_detr/dab_detr_transformer.py +++ b/easycv/models/detection/detectors/dab_detr/dab_detr_transformer.py @@ -14,6 +14,7 @@ import torch import torch.nn.functional as F from torch import Tensor, nn +from easycv.framework.errors import NotImplementedError, ValueError from easycv.models.builder import NECKS from easycv.models.detection.utils import inverse_sigmoid from easycv.models.utils import (MLP, TransformerEncoder, diff --git a/easycv/models/detection/detectors/detection.py b/easycv/models/detection/detectors/detection.py index fe91fbf8..bcd8edf0 100644 --- a/easycv/models/detection/detectors/detection.py +++ b/easycv/models/detection/detectors/detection.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+from easycv.framework.errors import ValueError from easycv.models.base import BaseModel from easycv.models.builder import (MODELS, build_backbone, build_head, build_neck) diff --git a/easycv/models/detection/detectors/dino/deformable_transformer.py b/easycv/models/detection/detectors/dino/deformable_transformer.py index 57d5f51d..447af568 100644 --- a/easycv/models/detection/detectors/dino/deformable_transformer.py +++ b/easycv/models/detection/detectors/dino/deformable_transformer.py @@ -15,6 +15,7 @@ from typing import Optional import torch from torch import Tensor, nn +from easycv.framework.errors import NotImplementedError from easycv.models.builder import NECKS from easycv.models.detection.utils import (gen_encoder_output_proposals, gen_sineembed_for_position, diff --git a/easycv/models/detection/detectors/dino/dino_head.py b/easycv/models/detection/detectors/dino/dino_head.py index bd581418..19ac173c 100644 --- a/easycv/models/detection/detectors/dino/dino_head.py +++ b/easycv/models/detection/detectors/dino/dino_head.py @@ -7,6 +7,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from easycv.framework.errors import NotImplementedError from easycv.models.builder import HEADS, build_neck from easycv.models.detection.utils import (DetrPostProcess, box_xyxy_to_cxcywh, inverse_sigmoid) diff --git a/easycv/models/detection/detectors/yolox/asff.py b/easycv/models/detection/detectors/yolox/asff.py index d4c62c3c..7af1ae84 100644 --- a/easycv/models/detection/detectors/yolox/asff.py +++ b/easycv/models/detection/detectors/yolox/asff.py @@ -3,6 +3,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from easycv.framework.errors import ValueError from easycv.models.backbones.network_blocks import BaseConv diff --git a/easycv/models/detection/detectors/yolox/yolo_head_template.py b/easycv/models/detection/detectors/yolox/yolo_head_template.py index 63923abf..a8e4fb03 100644 --- a/easycv/models/detection/detectors/yolox/yolo_head_template.py +++ b/easycv/models/detection/detectors/yolox/yolo_head_template.py @@ -8,6 +8,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from easycv.framework.errors import KeyError, RuntimeError from easycv.models.backbones.network_blocks import BaseConv, DWConv from easycv.models.backbones.repvgg_yolox_backbone import RepVGGBlock from easycv.models.detection.utils import bboxes_iou diff --git a/easycv/models/detection/necks/fpn.py b/easycv/models/detection/necks/fpn.py index 6d14bbef..6f71eda0 100644 --- a/easycv/models/detection/necks/fpn.py +++ b/easycv/models/detection/necks/fpn.py @@ -3,6 +3,7 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule +from easycv.framework.errors import NotImplementedError from easycv.models.registry import NECKS @@ -37,7 +38,6 @@ class FPN(nn.Module): Default: None. upsample_cfg (dict): Config dict for interpolate layer. Default: dict(mode='nearest'). - init_cfg (dict or list[dict], optional): Initialization config dict. 
Example: >>> import torch >>> in_channels = [2, 3, 5, 7] @@ -67,8 +67,6 @@ class FPN(nn.Module): norm_cfg=None, act_cfg=None, upsample_cfg=dict(mode='nearest')): - # init_cfg=dict( - # type='Xavier', layer='Conv2d', distribution='uniform')): super(FPN, self).__init__() assert isinstance(in_channels, list) self.in_channels = in_channels diff --git a/easycv/models/detection/necks/sfp.py b/easycv/models/detection/necks/sfp.py index be1273b0..62e581ea 100644 --- a/easycv/models/detection/necks/sfp.py +++ b/easycv/models/detection/necks/sfp.py @@ -2,26 +2,13 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule +from easycv.framework.errors import NotImplementedError from easycv.models.builder import NECKS -class Norm2d(nn.Module): - - def __init__(self, embed_dim): - super().__init__() - self.ln = nn.LayerNorm(embed_dim, eps=1e-6) - - def forward(self, x): - x = x.permute(0, 2, 3, 1) - x = self.ln(x) - x = x.permute(0, 3, 1, 2).contiguous() - return x - - @NECKS.register_module() -class SFP(BaseModule): +class SFP(nn.Module): r"""Simple Feature Pyramid. This is an implementation of paper `Exploring Plain Vision Transformer Backbones for Object Detection `_. Args: @@ -32,25 +19,12 @@ class SFP(BaseModule): build the feature pyramid. Default: 0. end_level (int): Index of the end input backbone level (exclusive) to build the feature pyramid. Default: -1, which means the last level. - add_extra_convs (bool | str): If bool, it decides whether to add conv - layers on top of the original feature maps. Default to False. - If True, it is equivalent to `add_extra_convs='on_input'`. - If str, it specifies the source feature map of the extra convs. - Only the following options are allowed - - 'on_input': Last feat map of neck inputs (i.e. backbone feature). - - 'on_lateral': Last feature map after lateral convs. - - 'on_output': The last output feature map after fpn convs. - relu_before_extra_convs (bool): Whether to apply relu before the extra conv. Default: False. - no_norm_on_lateral (bool): Whether to apply norm on lateral. Default: False. conv_cfg (dict): Config dict for convolution layer. Default: None. norm_cfg (dict): Config dict for normalization layer. Default: None. act_cfg (str): Config dict for activation layer in ConvModule. Default: None. - upsample_cfg (dict): Config dict for interpolate layer. - Default: `dict(mode='nearest')` - init_cfg (dict or list[dict], optional): Initialization config dict. 
Example: >>> import torch >>> in_channels = [2, 3, 5, 7] @@ -70,158 +44,83 @@ class SFP(BaseModule): def __init__(self, in_channels, out_channels, + scale_factors, num_outs, - start_level=0, - end_level=-1, - add_extra_convs=False, - relu_before_extra_convs=False, - no_norm_on_lateral=False, conv_cfg=None, norm_cfg=None, - act_cfg=None, - upsample_cfg=dict(mode='nearest'), - init_cfg=[ - dict( - type='Xavier', - layer=['Conv2d'], - distribution='uniform'), - dict(type='Constant', layer=['LayerNorm'], val=1, bias=0) - ]): - super(SFP, self).__init__(init_cfg) - assert isinstance(in_channels, list) - self.in_channels = in_channels + act_cfg=None): + super(SFP, self).__init__() + dim = in_channels self.out_channels = out_channels - self.num_ins = len(in_channels) + self.scale_factors = scale_factors + self.num_ins = len(scale_factors) self.num_outs = num_outs - self.relu_before_extra_convs = relu_before_extra_convs - self.no_norm_on_lateral = no_norm_on_lateral - self.upsample_cfg = upsample_cfg.copy() - if end_level == -1: - self.backbone_end_level = self.num_ins - assert num_outs >= self.num_ins - start_level - else: - # if end_level < inputs, no extra level is allowed - self.backbone_end_level = end_level - assert end_level <= len(in_channels) - assert num_outs == end_level - start_level - self.start_level = start_level - self.end_level = end_level - self.add_extra_convs = add_extra_convs - assert isinstance(add_extra_convs, (str, bool)) - if isinstance(add_extra_convs, str): - # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' - assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') - elif add_extra_convs: # True - self.add_extra_convs = 'on_input' - - self.top_downs = nn.ModuleList() - self.lateral_convs = nn.ModuleList() - self.fpn_convs = nn.ModuleList() - - for i in range(self.start_level, self.backbone_end_level): - if i == 0: - top_down = nn.Sequential( + self.stages = [] + for idx, scale in enumerate(scale_factors): + out_dim = dim + if scale == 4.0: + layers = [ + nn.ConvTranspose2d(dim, dim // 2, 2, stride=2, padding=0), + nn.GroupNorm(1, dim // 2, eps=1e-6), + nn.GELU(), nn.ConvTranspose2d( - in_channels[i], in_channels[i], 2, stride=2, - padding=0), Norm2d(in_channels[i]), nn.GELU(), - nn.ConvTranspose2d( - in_channels[i], in_channels[i], 2, stride=2, - padding=0)) - elif i == 1: - top_down = nn.ConvTranspose2d( - in_channels[i], in_channels[i], 2, stride=2, padding=0) - elif i == 2: - top_down = nn.Identity() - elif i == 3: - top_down = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) + dim // 2, dim // 4, 2, stride=2, padding=0) + ] + out_dim = dim // 4 + elif scale == 2.0: + layers = [ + nn.ConvTranspose2d(dim, dim // 2, 2, stride=2, padding=0) + ] + out_dim = dim // 2 + elif scale == 1.0: + layers = [] + elif scale == 0.5: + layers = [nn.MaxPool2d(kernel_size=2, stride=2, padding=0)] + else: + raise NotImplementedError( + f'scale_factor={scale} is not supported yet.') - l_conv = ConvModule( - in_channels[i], - out_channels, - 1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, - act_cfg=act_cfg, - inplace=False) - fpn_conv = ConvModule( - out_channels, - out_channels, - 3, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - inplace=False) - - self.top_downs.append(top_down) - self.lateral_convs.append(l_conv) - self.fpn_convs.append(fpn_conv) - - # add extra conv layers (e.g., RetinaNet) - extra_levels = num_outs - self.backbone_end_level + self.start_level - if self.add_extra_convs and 
extra_levels >= 1: - for i in range(extra_levels): - if i == 0 and self.add_extra_convs == 'on_input': - in_channels = self.in_channels[self.backbone_end_level - 1] - else: - in_channels = out_channels - extra_fpn_conv = ConvModule( - in_channels, + layers.extend([ + ConvModule( + out_dim, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False), + ConvModule( + out_channels, out_channels, 3, - stride=2, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) - self.fpn_convs.append(extra_fpn_conv) + ]) + + layers = nn.Sequential(*layers) + self.add_module(f'sfp_{idx}', layers) + self.stages.append(layers) + + def init_weights(self): + pass def forward(self, inputs): """Forward function.""" - assert len(inputs) == 1 + features = inputs[0] + outs = [] - # build top-down path - features = [ - top_down(inputs[0]) for _, top_down in enumerate(self.top_downs) - ] - assert len(features) == len(self.in_channels) + # part 1: build simple feature pyramid + for stage in self.stages: + outs.append(stage(features)) - # build laterals - laterals = [ - lateral_conv(features[i + self.start_level]) - for i, lateral_conv in enumerate(self.lateral_convs) - ] - - used_backbone_levels = len(laterals) - - # build outputs - # part 1: from original levels - outs = [ - self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) - ] # part 2: add extra levels - if self.num_outs > len(outs): + if self.num_outs > self.num_ins: # use max pool to get more levels on top of outputs # (e.g., Faster R-CNN, Mask R-CNN) - if not self.add_extra_convs: - for i in range(self.num_outs - used_backbone_levels): - outs.append(F.max_pool2d(outs[-1], 1, stride=2)) - # add conv layers on top of original feature maps (RetinaNet) - else: - if self.add_extra_convs == 'on_input': - extra_source = inputs[self.backbone_end_level - 1] - elif self.add_extra_convs == 'on_lateral': - extra_source = laterals[-1] - elif self.add_extra_convs == 'on_output': - extra_source = outs[-1] - else: - raise NotImplementedError - outs.append(self.fpn_convs[used_backbone_levels](extra_source)) - for i in range(used_backbone_levels + 1, self.num_outs): - if self.relu_before_extra_convs: - outs.append(self.fpn_convs[i](F.relu(outs[-1]))) - else: - outs.append(self.fpn_convs[i](outs[-1])) + for i in range(self.num_outs - self.num_ins): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) return tuple(outs) diff --git a/easycv/models/detection/utils/misc.py b/easycv/models/detection/utils/misc.py index a9605a3b..9a2de7a9 100644 --- a/easycv/models/detection/utils/misc.py +++ b/easycv/models/detection/utils/misc.py @@ -9,6 +9,8 @@ from packaging import version from torch import Tensor from torch.autograd import Function +from easycv.framework.errors import NotImplementedError + if version.parse(torchvision.__version__) < version.parse('0.7'): from torchvision.ops import _new_empty_tensor from torchvision.ops.misc import _output_size diff --git a/easycv/models/heads/cls_head.py b/easycv/models/heads/cls_head.py index 4b4654d6..ff3724d7 100644 --- a/easycv/models/heads/cls_head.py +++ b/easycv/models/heads/cls_head.py @@ -28,7 +28,8 @@ class ClsHead(nn.Module): }, input_feature_index=[0], init_cfg=dict( - type='Normal', layer='Linear', std=0.01, bias=0.)): + type='Normal', layer='Linear', std=0.01, bias=0.), + use_num_classes=True): super(ClsHead, self).__init__() self.with_avg_pool = with_avg_pool @@ -46,7 +47,8 @@ class ClsHead(nn.Module): 'label_smooth must be given as a float number in 
[0,1]' logger.info(f'=> Augment: using label smooth={self.label_smooth}') loss_config['label_smooth'] = label_smooth - loss_config['num_classes'] = num_classes + if use_num_classes: + loss_config['num_classes'] = num_classes self.criterion = build_from_cfg(loss_config, LOSSES) diff --git a/easycv/models/loss/cross_entropy_loss.py b/easycv/models/loss/cross_entropy_loss.py index 69e1f615..ad8661cb 100644 --- a/easycv/models/loss/cross_entropy_loss.py +++ b/easycv/models/loss/cross_entropy_loss.py @@ -7,6 +7,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from easycv.framework.errors import ValueError from easycv.models.builder import LOSSES from easycv.models.loss.utils import weight_reduce_loss @@ -115,6 +116,7 @@ def binary_cross_entropy(pred, class_weight=None, ignore_index=-100, avg_non_ignore=False, + label_ceil=False, **kwargs): """Calculate the binary CrossEntropy loss. @@ -132,11 +134,14 @@ def binary_cross_entropy(pred, avg_non_ignore (bool): The flag decides to whether the loss is only averaged over non-ignored targets. Default: False. `New in version 0.23.0.` + label_ceil (bool): When use bce and set label_ceil=True, + it will make elements belong to (0, 1] in label change to 1. + Default: False. Returns: torch.Tensor: The calculated loss """ - if len(pred.shape) > 1 and pred.shape(1) == 1: + if len(pred.shape) > 1 and pred.shape[1] == 1: # For binary class segmentation, the shape of pred is # [N, 1, H, W] and that of label is [N, H, W]. # As the ignore_index often set as 255, so the @@ -162,6 +167,8 @@ def binary_cross_entropy(pred, weight = weight * valid_mask else: weight = valid_mask + if label_ceil: + label = label.gt(0.0).type(label.dtype) # average loss over non-ignored and valid elements if reduction == 'mean' and avg_factor is None and avg_non_ignore: avg_factor = valid_mask.sum().item() @@ -234,6 +241,9 @@ class CrossEntropyLoss(nn.Module): avg_non_ignore (bool): The flag decides to whether the loss is only averaged over non-ignored targets. Default: False. `New in version 0.23.0.` + label_ceil (bool): When use bce and set label_ceil=True, + it will make elements belong to (0, 1] in label change to 1. + Default: False. """ def __init__(self, @@ -243,10 +253,16 @@ class CrossEntropyLoss(nn.Module): class_weight=None, loss_weight=1.0, loss_name='loss_ce', - avg_non_ignore=False): + avg_non_ignore=False, + label_ceil=False): super(CrossEntropyLoss, self).__init__() assert (use_sigmoid is False) or (use_mask is False) self.use_sigmoid = use_sigmoid + if label_ceil: + if not use_sigmoid: + raise ValueError( + '‘label_ceil’ is supported only when ‘use_sigmoid’ is true. If not use bce, please set ‘label_ceil’=False' + ) self.use_mask = use_mask self.reduction = reduction self.loss_weight = loss_weight @@ -266,6 +282,7 @@ class CrossEntropyLoss(nn.Module): else: self.cls_criterion = cross_entropy self._loss_name = loss_name + self.label_ceil = label_ceil def extra_repr(self): """Extra repr.""" @@ -289,16 +306,29 @@ class CrossEntropyLoss(nn.Module): else: class_weight = None # Note: for BCE loss, label < 0 is invalid. 
- loss_cls = self.loss_weight * self.cls_criterion( - cls_score, - label, - weight, - class_weight=class_weight, - reduction=reduction, - avg_factor=avg_factor, - avg_non_ignore=self.avg_non_ignore, - ignore_index=ignore_index, - **kwargs) + if self.use_sigmoid: + loss_cls = self.loss_weight * self.cls_criterion( + cls_score, + label, + weight, + class_weight=class_weight, + reduction=reduction, + avg_factor=avg_factor, + avg_non_ignore=self.avg_non_ignore, + ignore_index=ignore_index, + label_ceil=self.label_ceil, + **kwargs) + else: + loss_cls = self.loss_weight * self.cls_criterion( + cls_score, + label, + weight, + class_weight=class_weight, + reduction=reduction, + avg_factor=avg_factor, + avg_non_ignore=self.avg_non_ignore, + ignore_index=ignore_index, + **kwargs) return loss_cls @property diff --git a/easycv/models/loss/focal_loss.py b/easycv/models/loss/focal_loss.py index 0cec5ddb..f4ea5a47 100644 --- a/easycv/models/loss/focal_loss.py +++ b/easycv/models/loss/focal_loss.py @@ -4,6 +4,7 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.ops import sigmoid_focal_loss as _sigmoid_focal_loss +from easycv.framework.errors import NotImplementedError from easycv.models.builder import LOSSES from easycv.models.loss.utils import weight_reduce_loss diff --git a/easycv/models/loss/iou_loss.py b/easycv/models/loss/iou_loss.py index 8a4af4bb..72611f02 100644 --- a/easycv/models/loss/iou_loss.py +++ b/easycv/models/loss/iou_loss.py @@ -7,6 +7,7 @@ import mmcv import torch import torch.nn as nn +from easycv.framework.errors import NotImplementedError from easycv.models.detection.utils import bbox_overlaps from easycv.models.loss.utils import weighted_loss from ..registry import LOSSES diff --git a/easycv/models/loss/utils.py b/easycv/models/loss/utils.py index b08e7cf3..0164b104 100644 --- a/easycv/models/loss/utils.py +++ b/easycv/models/loss/utils.py @@ -4,6 +4,8 @@ import functools import torch import torch.nn.functional as F +from easycv.framework.errors import ValueError + def reduce_loss(loss, reduction): """Reduce loss as specified. 
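The `label_ceil` flag added to `CrossEntropyLoss` above only takes effect together with `use_sigmoid=True`: right before the BCE term is computed, every label element in (0, 1] is rounded up to 1 via `label.gt(0.0)`, so mixup-style soft targets are treated as multi-hot labels. Below is a minimal sketch of that behaviour, assuming the class is importable from `easycv.models.loss.cross_entropy_loss` (the module patched in this hunk) and that its forward accepts `(cls_score, label)` as in the call shown above; treat both as assumptions rather than guaranteed API.

```python
import torch

# Assumed import path: the class is defined in the file patched above and
# registered in LOSSES, so this direct module import is illustrative only.
from easycv.models.loss.cross_entropy_loss import CrossEntropyLoss

# label_ceil is only valid with use_sigmoid=True; otherwise the constructor
# above raises a ValueError.
criterion = CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0, label_ceil=True)

logits = torch.randn(4, 10)       # raw scores, shape (N, num_classes)
soft_labels = torch.rand(4, 10)   # e.g. mixup targets in [0, 1]

# What label_ceil does internally before BCE: (0, 1] -> 1, zeros stay 0.
ceiled = soft_labels.gt(0.0).type(soft_labels.dtype)

# Assuming the usual forward(cls_score, label, ...) signature.
loss = criterion(logits, soft_labels)
print(loss, ceiled[0])
```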
diff --git a/easycv/models/modelzoo.py b/easycv/models/modelzoo.py index 0680dd21..58f005c4 100644 --- a/easycv/models/modelzoo.py +++ b/easycv/models/modelzoo.py @@ -253,4 +253,10 @@ timm_models = { 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/pretrained_models/timm/swin_small_patch4_window7_224_statedict.pth', 'dynamic_swin_tiny_p4_w7_224': 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/pretrained_models/timm/swin_tiny_patch4_window7_224_statedict.pth', + + # dynamic_vit: + 'dynamic_vit_base_p16': + 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/timm/vit/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz', + 'dynamic_vit_large_p16': + 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/timm/vit/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz', } diff --git a/easycv/models/pose/heads/topdown_heatmap_base_head.py b/easycv/models/pose/heads/topdown_heatmap_base_head.py index adc3dfa6..afcfc8f3 100644 --- a/easycv/models/pose/heads/topdown_heatmap_base_head.py +++ b/easycv/models/pose/heads/topdown_heatmap_base_head.py @@ -7,6 +7,7 @@ import numpy as np import torch.nn as nn from easycv.core.evaluation.top_down_eval import keypoints_from_heatmaps +from easycv.framework.errors import ValueError class TopdownHeatmapBaseHead(nn.Module): diff --git a/easycv/models/pose/heads/topdown_heatmap_simple_head.py b/easycv/models/pose/heads/topdown_heatmap_simple_head.py index ba3c746b..8811d7eb 100644 --- a/easycv/models/pose/heads/topdown_heatmap_simple_head.py +++ b/easycv/models/pose/heads/topdown_heatmap_simple_head.py @@ -7,6 +7,7 @@ from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, from easycv.core.evaluation import pose_pck_accuracy from easycv.core.post_processing import flip_back +from easycv.framework.errors import TypeError, ValueError from easycv.models.builder import HEADS, build_loss from easycv.models.utils.ops import resize_tensor as resize from .topdown_heatmap_base_head import TopdownHeatmapBaseHead diff --git a/easycv/models/segmentation/encoder_decoder.py b/easycv/models/segmentation/encoder_decoder.py index 6b96c98f..63577528 100644 --- a/easycv/models/segmentation/encoder_decoder.py +++ b/easycv/models/segmentation/encoder_decoder.py @@ -3,6 +3,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from easycv.framework.errors import TypeError, ValueError from easycv.models import builder from easycv.models.base import BaseModel from easycv.models.builder import MODELS diff --git a/easycv/models/segmentation/heads/base.py b/easycv/models/segmentation/heads/base.py index f1508a25..3aaf85ff 100644 --- a/easycv/models/segmentation/heads/base.py +++ b/easycv/models/segmentation/heads/base.py @@ -7,6 +7,7 @@ import torch.nn as nn from mmcv.cnn.utils import initialize from easycv.core.evaluation.metrics import accuracy +from easycv.framework.errors import TypeError from easycv.models.builder import build_loss from easycv.models.utils.ops import resize_tensor from easycv.utils.logger import print_log diff --git a/easycv/models/segmentation/heads/transformer_decoder.py b/easycv/models/segmentation/heads/transformer_decoder.py index 3a42072d..88ef6303 100644 --- a/easycv/models/segmentation/heads/transformer_decoder.py +++ b/easycv/models/segmentation/heads/transformer_decoder.py @@ -5,6 +5,8 @@ import torch from torch import Tensor, 
nn from torch.nn import functional as F +from easycv.framework.errors import RuntimeError, ValueError + class PositionEmbeddingSine(nn.Module): """ diff --git a/easycv/models/segmentation/utils/criterion.py b/easycv/models/segmentation/utils/criterion.py index 490b2cb7..29345d12 100644 --- a/easycv/models/segmentation/utils/criterion.py +++ b/easycv/models/segmentation/utils/criterion.py @@ -8,6 +8,7 @@ import torchvision from mmcv.runner import get_dist_info from torch import Tensor, nn +from easycv.framework.errors import ValueError from .point_rend import (get_uncertain_point_coords_with_randomness, point_sample) diff --git a/easycv/models/selfsup/byol.py b/easycv/models/selfsup/byol.py index 44d42d10..9bdce95d 100644 --- a/easycv/models/selfsup/byol.py +++ b/easycv/models/selfsup/byol.py @@ -2,6 +2,7 @@ import torch import torch.nn as nn +from easycv.framework.errors import KeyError from easycv.utils.checkpoint import load_checkpoint from easycv.utils.logger import get_root_logger from .. import builder @@ -97,4 +98,4 @@ class BYOL(BaseModel): elif mode == 'extract': return self.backbone(img) else: - raise Exception('No such mode: {}'.format(mode)) + raise KeyError('No such mode: {}'.format(mode)) diff --git a/easycv/models/selfsup/dino.py b/easycv/models/selfsup/dino.py index f9978974..1e21cad4 100644 --- a/easycv/models/selfsup/dino.py +++ b/easycv/models/selfsup/dino.py @@ -9,6 +9,7 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.runner import get_dist_info +from easycv.framework.errors import KeyError, NotImplementedError, ValueError from easycv.utils.preprocess_function import (gaussianBlurDynamic, randomGrayScale, solarize) from .. import builder @@ -416,4 +417,4 @@ class DINO(BaseModel): # rv['gt_labels'] = gt_label.cpu() # return rv else: - raise Exception('No such mode: {}'.format(mode)) + raise KeyError('No such mode: {}'.format(mode)) diff --git a/easycv/models/selfsup/mae.py b/easycv/models/selfsup/mae.py index 9efb686c..de9d062a 100644 --- a/easycv/models/selfsup/mae.py +++ b/easycv/models/selfsup/mae.py @@ -1,5 +1,6 @@ import torch +from easycv.framework.errors import KeyError from .. 
import builder from ..base import BaseModel from ..registry import MODELS @@ -84,4 +85,4 @@ class MAE(BaseModel): elif mode == 'test': return self.forward_test(img, **kwargs) else: - raise Exception('No such mode: {}'.format(mode)) + raise KeyError('No such mode: {}'.format(mode)) diff --git a/easycv/models/selfsup/moby.py b/easycv/models/selfsup/moby.py index 8ed0e9e5..94ebb5c5 100644 --- a/easycv/models/selfsup/moby.py +++ b/easycv/models/selfsup/moby.py @@ -3,6 +3,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from easycv.framework.errors import KeyError, ValueError from easycv.utils.checkpoint import load_checkpoint from easycv.utils.logger import get_root_logger from easycv.utils.preprocess_function import gaussianBlur, randomGrayScale @@ -269,12 +270,14 @@ class MoBY(BaseModel): if name in rd.keys(): rv[name] = rd[name] else: - raise 'Extract %s is not support in classification models' % name + raise ValueError( + 'Extract %s is not support in classification models' % + name) if gt_label is not None: rv['gt_labels'] = gt_label.cpu() return rv else: - raise Exception('No such mode: {}'.format(mode)) + raise KeyError('No such mode: {}'.format(mode)) # utils diff --git a/easycv/models/selfsup/moco.py b/easycv/models/selfsup/moco.py index 5094f61d..e566f726 100644 --- a/easycv/models/selfsup/moco.py +++ b/easycv/models/selfsup/moco.py @@ -2,6 +2,7 @@ import torch import torch.nn as nn +from easycv.framework.errors import KeyError, ValueError from easycv.utils.checkpoint import load_checkpoint from easycv.utils.logger import get_root_logger from easycv.utils.preprocess_function import gaussianBlur, randomGrayScale @@ -232,12 +233,14 @@ class MOCO(BaseModel): if name in rd.keys(): rv[name] = rd[name] else: - raise 'Extract %s is not support in classification models' % name + raise ValueError( + 'Extract %s is not support in classification models' % + name) if gt_label is not None: rv['gt_labels'] = gt_label.cpu() return rv else: - raise Exception('No such mode: {}'.format(mode)) + raise KeyError('No such mode: {}'.format(mode)) # utils diff --git a/easycv/models/selfsup/simclr.py b/easycv/models/selfsup/simclr.py index 3b26eaf4..28e1b324 100644 --- a/easycv/models/selfsup/simclr.py +++ b/easycv/models/selfsup/simclr.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import torch +from easycv.framework.errors import KeyError from easycv.utils.checkpoint import load_checkpoint from easycv.utils.logger import get_root_logger from easycv.utils.preprocess_function import gaussianBlur, randomGrayScale @@ -97,4 +98,4 @@ class SimCLR(BaseModel): elif mode == 'extract': return self.forward_backbone(img) else: - raise Exception('No such mode: {}'.format(mode)) + raise KeyError('No such mode: {}'.format(mode)) diff --git a/easycv/models/selfsup/swav.py b/easycv/models/selfsup/swav.py index a7e8af4f..1393fc29 100644 --- a/easycv/models/selfsup/swav.py +++ b/easycv/models/selfsup/swav.py @@ -5,6 +5,7 @@ import torch.distributed as dist import torch.nn as nn from mmcv.runner import get_dist_info +from easycv.framework.errors import KeyError, ValueError from easycv.utils.checkpoint import load_checkpoint from easycv.utils.logger import get_root_logger from easycv.utils.preprocess_function import gaussianBlur, randomGrayScale @@ -193,12 +194,14 @@ class SWAV(BaseModel): if name in rd.keys(): rv[name] = rd[name] else: - raise 'Extract %s is not support in classification models' % name + raise ValueError( + 'Extract %s is not support in classification models' % + name) if gt_label is not None: rv['gt_labels'] = gt_label.cpu() return rv else: - raise Exception('No such mode: {}'.format(mode)) + raise KeyError('No such mode: {}'.format(mode)) class MultiPrototypes(nn.Module): diff --git a/easycv/models/utils/conv_module.py b/easycv/models/utils/conv_module.py index 26364a43..bfebf816 100644 --- a/easycv/models/utils/conv_module.py +++ b/easycv/models/utils/conv_module.py @@ -4,6 +4,7 @@ import warnings import torch.nn as nn from mmcv.cnn import constant_init, kaiming_init +from easycv.framework.errors import KeyError from .activation import build_activation_layer from .conv_ws import ConvWS2d from .norm import build_norm_layer diff --git a/easycv/models/utils/norm.py b/easycv/models/utils/norm.py index 85191b55..f5ca46e7 100644 --- a/easycv/models/utils/norm.py +++ b/easycv/models/utils/norm.py @@ -2,6 +2,8 @@ import torch import torch.nn as nn +from easycv.framework.errors import KeyError, NotImplementedError + class SyncIBN(nn.Module): r"""Instance-Batch Normalization layer from diff --git a/easycv/models/utils/transformer.py b/easycv/models/utils/transformer.py index e76fbb44..3cb19931 100644 --- a/easycv/models/utils/transformer.py +++ b/easycv/models/utils/transformer.py @@ -6,6 +6,8 @@ import torch.nn as nn import torch.nn.functional as F from torch import Tensor +from easycv.framework.errors import RuntimeError + class MLP(nn.Module): """ Very simple multi-layer perceptron (also called FFN)""" diff --git a/easycv/predictors/__init__.py b/easycv/predictors/__init__.py index 3fe86936..2d38f2f2 100644 --- a/easycv/predictors/__init__.py +++ b/easycv/predictors/__init__.py @@ -9,5 +9,4 @@ from .feature_extractor import (TorchFaceAttrExtractor, from .hand_keypoints_predictor import HandKeypointsPredictor from .pose_predictor import (TorchPoseTopDownPredictor, TorchPoseTopDownPredictorWithDetector) -from .segmentation import (Mask2formerPredictor, SegFormerPredictor, - SegmentationPredictor) +from .segmentation import Mask2formerPredictor, SegmentationPredictor diff --git a/easycv/predictors/base.py b/easycv/predictors/base.py index 49f3a728..5b36f2fd 100644 --- a/easycv/predictors/base.py +++ b/easycv/predictors/base.py @@ -1,18 +1,23 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+import json import os import pickle +import cv2 import numpy as np import torch from mmcv.parallel import collate, scatter_kwargs from PIL import Image +from torch.hub import load_state_dict_from_url from torchvision.transforms import Compose from easycv.datasets.registry import PIPELINES from easycv.file import io +from easycv.file.utils import is_url_path +from easycv.framework.errors import ValueError from easycv.models.builder import build_model from easycv.utils.checkpoint import load_checkpoint -from easycv.utils.config_tools import mmcv_config_fromfile +from easycv.utils.config_tools import Config, mmcv_config_fromfile from easycv.utils.constant import CACHE_DIR from easycv.utils.mmlab_utils import (dynamic_adapt_for_mmlab, remove_adapt_for_mmlab) @@ -106,7 +111,9 @@ class PredictorV2(object): device (str): Support 'cuda' or 'cpu', if is None, detect device automatically. save_results (bool): Whether to save predict results. save_path (str): File path for saving results, only valid when `save_results` is True. + pipelines (list[dict]): Data pipeline configs. """ + INPUT_IMAGE_MODE = 'BGR' # the image mode into the model def __init__(self, model_path, @@ -115,30 +122,51 @@ class PredictorV2(object): device=None, save_results=False, save_path=None, - mode='rgb', + pipelines=None, *args, **kwargs): self.model_path = model_path self.batch_size = batch_size self.save_results = save_results self.save_path = save_path + self.config_file = config_file if self.save_results: assert self.save_path is not None self.device = device if self.device is None: self.device = 'cuda' if torch.cuda.is_available() else 'cpu' - self.cfg = None if config_file is not None: if isinstance(config_file, str): self.cfg = mmcv_config_fromfile(config_file) else: self.cfg = config_file + else: + self.cfg = self._load_cfg_from_ckpt(self.model_path) + + if self.cfg is None: + raise ValueError('Please provide "config_file"!') self.model = self.prepare_model() + self.pipelines = pipelines self.processor = self.build_processor() self._load_op = None - self.mode = mode + + def _load_cfg_from_ckpt(self, model_path): + if is_url_path(model_path): + ckpt = load_state_dict_from_url(model_path) + else: + with io.open(model_path, 'rb') as infile: + ckpt = torch.load(infile, map_location='cpu') + + cfg = None + if 'meta' in ckpt and 'config' in ckpt['meta']: + cfg = ckpt['meta']['config'] + if isinstance(cfg, dict): + cfg = Config(cfg) + elif isinstance(cfg, str): + cfg = Config(json.loads(cfg)) + return cfg def prepare_model(self): """Build model from config file by default. @@ -151,8 +179,6 @@ class PredictorV2(object): return model def _build_model(self): - if self.cfg is None: - raise ValueError('Please provide "config_file"!') # Use mmdet model dynamic_adapt_for_mmlab(self.cfg) model = build_model(self.cfg.model) @@ -164,16 +190,15 @@ class PredictorV2(object): """Build processor to process loaded input. If you need custom preprocessing ops, you need to reimplement it. 
""" - if self.cfg is None: - pipeline = [] + if self.pipelines is not None: + pipelines = self.pipelines else: - pipeline = [ - build_from_cfg(p, PIPELINES) - for p in self.cfg.get('test_pipeline', []) - ] + pipelines = self.cfg.get('test_pipeline', []) + + pipelines = [build_from_cfg(p, PIPELINES) for p in pipelines] from easycv.datasets.shared.pipelines.transforms import Compose - processor = Compose(pipeline) + processor = Compose(pipelines) return processor def _load_input(self, input): @@ -189,10 +214,13 @@ class PredictorV2(object): } """ if self._load_op is None: - load_cfg = dict(type='LoadImage', mode=self.mode) + load_cfg = dict(type='LoadImage', mode=self.INPUT_IMAGE_MODE) self._load_op = build_from_cfg(load_cfg, PIPELINES) if not isinstance(input, str): + if isinstance(input, np.ndarray): + # Only support RGB mode if input is np.ndarray. + input = cv2.cvtColor(input, cv2.COLOR_RGB2BGR) sample = self._load_op({'img': input}) else: sample = self._load_op({'filename': input}) @@ -228,8 +256,32 @@ class PredictorV2(object): return outputs def postprocess(self, inputs, *args, **kwargs): - """Process model outputs. - If you need add some processing ops to process model outputs, you need to reimplement it. + """Process model batch outputs. + """ + outputs = [] + out_i = {} + batch_size = 1 + # get current batch size + for k, batch_v in inputs.items(): + if batch_v is not None: + batch_size = len(batch_v) + break + + for i in range(batch_size): + for k, batch_v in inputs.items(): + if batch_v is not None: + out_i[k] = batch_v[i] + else: + out_i[k] = None + + out_i = self.postprocess_single(out_i) + outputs.append(out_i) + + return outputs + + def postprocess_single(self, inputs): + """Process outputs of single sample. + If you need add some processing ops, you need to reimplement it. """ return inputs @@ -259,16 +311,22 @@ class PredictorV2(object): results_list = [] for i in range(0, len(inputs), self.batch_size): - batch = inputs[i:max(len(inputs) - 1, i + self.batch_size)] + batch = inputs[i:min(len(inputs), i + self.batch_size)] batch_outputs = self.preprocess(batch) batch_outputs = self.forward(batch_outputs) results = self.postprocess(batch_outputs) + assert len(results) == len( + batch), f'Mismatch size {len(results)} != {len(batch)}' if keep_inputs: - results = {'inputs': batch, 'results': results} + for i in range(len(batch)): + results[i].update({'inputs': batch[i]}) # if dump, the outputs will not added to the return value to prevent taking up too much memory if self.save_results: - self.dump([results], self.save_path, mode='ab+') + self.dump(results, self.save_path, mode='ab+') else: - results_list.append(results) + if isinstance(results, list): + results_list.extend(results) + else: + results_list.append(results) return results_list diff --git a/easycv/predictors/classifier.py b/easycv/predictors/classifier.py index be3a9967..a788c354 100644 --- a/easycv/predictors/classifier.py +++ b/easycv/predictors/classifier.py @@ -3,16 +3,130 @@ import math import numpy as np import torch +from PIL import Image, ImageFile -from .base import Predictor +from easycv.file import io +from easycv.framework.errors import ValueError +from easycv.utils.misc import deprecated +from .base import Predictor, PredictorV2 from .builder import PREDICTORS + +@PREDICTORS.register_module() +class ClassificationPredictor(PredictorV2): + """Predictor for classification. + Args: + model_path (str): Path of model path. + config_file (Optinal[str]): config file path for model and processor to init. 
Defaults to None. + batch_size (int): batch size for forward. + device (str): Support 'cuda' or 'cpu', if is None, detect device automatically. + save_results (bool): Whether to save predict results. + save_path (str): File path for saving results, only valid when `save_results` is True. + pipelines (list[dict]): Data pipeline configs. + topk (int): Return top-k results. Default: 1. + pil_input (bool): Whether use PIL image. If processor need PIL input, set true, default false. + label_map_path (str): File path of saving labels list. + """ + + def __init__(self, + model_path, + config_file=None, + batch_size=1, + device=None, + save_results=False, + save_path=None, + pipelines=[], + topk=1, + pil_input=True, + label_map_path=[], + *args, + **kwargs): + super(ClassificationPredictor, self).__init__( + model_path, + config_file=config_file, + batch_size=batch_size, + device=device, + save_results=save_results, + save_path=save_path, + pipelines=pipelines, + *args, + **kwargs) + self.topk = topk + self.pil_input = pil_input + + # Adapt to torchvision transforms which process PIL inputs. + if self.pil_input: + self.INPUT_IMAGE_MODE = 'RGB' + + if label_map_path is None: + class_list = self.cfg.get('CLASSES', []) + else: + with io.open(label_map_path, 'r') as f: + class_list = f.readlines() + self.label_map = [i.strip() for i in class_list] + + def _load_input(self, input): + """Load image from file or numpy or PIL object. + Args: + input: File path or numpy or PIL object. + Returns: + { + 'filename': filename, + 'img': img, + 'img_shape': img_shape, + 'img_fields': ['img'] + } + """ + if self.pil_input: + results = {} + if isinstance(input, str): + img = Image.open(input) + if img.mode.upper() != self.INPUT_IMAGE_MODE.upper(): + img = img.convert(self.INPUT_IMAGE_MODE.upper()) + results['filename'] = input + else: + assert isinstance(input, ImageFile.ImageFile) + img = input + results['filename'] = None + results['img'] = img + results['img_shape'] = img.size + results['ori_shape'] = img.size + results['img_fields'] = ['img'] + return results + + return super()._load_input(input) + + def postprocess(self, inputs, *args, **kwargs): + """Return top-k results.""" + output_prob = inputs['prob'].data.cpu() + topk_class = torch.topk(output_prob, self.topk).indices.numpy() + output_prob = output_prob.numpy() + batch_results = [] + batch_size = output_prob.shape[0] + for i in range(batch_size): + result = {'class': np.squeeze(topk_class[i]).tolist()} + if isinstance(result['class'], int): + result['class'] = [result['class']] + + if len(self.label_map) > 0: + result['class_name'] = [ + self.label_map[i] for i in result['class'] + ] + result['class_probs'] = {} + for l_idx, l_name in enumerate(self.label_map): + result['class_probs'][l_name] = output_prob[i][l_idx] + + batch_results.append(result) + return batch_results + + try: from easy_vision.python.inference.predictor import PredictorInterface except: from .interface import PredictorInterface +@deprecated(reason='Please use ClassificationPredictor.') @PREDICTORS.register_module() class TorchClassifier(PredictorInterface): @@ -30,7 +144,7 @@ class TorchClassifier(PredictorInterface): """ self.predictor = Predictor(model_path) if 'class_list' not in self.predictor.cfg and label_map_path is None: - raise Exception( + raise ValueError( "label_map_path need to be set, when ckpt doesn't contain class_list" ) diff --git a/easycv/predictors/detector.py b/easycv/predictors/detector.py index f9d05992..38fd262f 100644 --- a/easycv/predictors/detector.py +++ 
b/easycv/predictors/detector.py @@ -5,9 +5,6 @@ from glob import glob import numpy as np import torch -from mmcv.ops import RoIPool -from mmcv.parallel import collate, scatter -from torch.hub import load_state_dict_from_url from torchvision.transforms import Compose from easycv.apis.export import reparameterize_models @@ -15,15 +12,12 @@ from easycv.core.visualization import imshow_bboxes from easycv.datasets.registry import PIPELINES from easycv.datasets.utils import replace_ImageToTensor from easycv.file import io -from easycv.file.utils import is_url_path, url_path_exists from easycv.models import build_model from easycv.models.detection.utils import postprocess from easycv.utils.checkpoint import load_checkpoint from easycv.utils.config_tools import mmcv_config_fromfile from easycv.utils.constant import CACHE_DIR -from easycv.utils.logger import get_root_logger -from easycv.utils.mmlab_utils import (dynamic_adapt_for_mmlab, - remove_adapt_for_mmlab) +from easycv.utils.misc import deprecated from easycv.utils.registry import build_from_cfg from .base import PredictorV2 from .builder import PREDICTORS @@ -46,14 +40,16 @@ class DetectionPredictor(PredictorV2): """ def __init__(self, - model_path=None, + model_path, config_file=None, batch_size=1, device=None, save_results=False, save_path=None, - mode='rgb', - score_threshold=0.5): + pipelines=None, + score_threshold=0.5, + *arg, + **kwargs): super(DetectionPredictor, self).__init__( model_path, config_file=config_file, @@ -61,207 +57,74 @@ class DetectionPredictor(PredictorV2): device=device, save_results=save_results, save_path=save_path, - mode=mode, + pipelines=pipelines, ) self.score_thresh = score_threshold + self.CLASSES = self.cfg.get('CLASSES', None) + + def build_processor(self): + if self.pipelines is not None: + pipelines = self.pipelines + elif self.cfg is None: + pipelines = [] + else: + pipelines = self.cfg.get('test_pipeline', []) + + # for batch inference + self.pipelines = replace_ImageToTensor(pipelines) + + return super().build_processor() + + def postprocess_single(self, inputs, *args, **kwargs): + if inputs['detection_scores'] is None or len( + inputs['detection_scores']) < 1: + return inputs + + scores = inputs['detection_scores'] + if scores is not None and self.score_thresh > 0: + keeped_ids = scores > self.score_thresh + inputs['detection_scores'] = inputs['detection_scores'][keeped_ids] + inputs['detection_boxes'] = inputs['detection_boxes'][keeped_ids] + inputs['detection_classes'] = inputs['detection_classes'][ + keeped_ids] + + class_names = [] + for _, classes_id in enumerate(inputs['detection_classes']): + if classes_id is None: + class_names.append(None) + elif self.CLASSES is not None and len(self.CLASSES) > 0: + class_names.append(self.CLASSES[int(classes_id)]) + else: + class_names.append(classes_id) + + inputs['detection_class_names'] = class_names - def postprocess(self, inputs, *args, **kwargs): - for batch_index in range(self.batch_size): - this_detection_scores = inputs['detection_scores'][batch_index] - sel_ids = this_detection_scores > self.score_thresh - inputs['detection_scores'][batch_index] = inputs[ - 'detection_scores'][batch_index][sel_ids] - inputs['detection_boxes'][batch_index] = inputs['detection_boxes'][ - batch_index][sel_ids] - inputs['detection_classes'][batch_index] = inputs[ - 'detection_classes'][batch_index][sel_ids] - # TODO class label remapping return inputs - -class DetrPredictor(PredictorInterface): - """Inference image(s) with the detector. 
- Args: - model_path (str): checkpoint model and export model are shared. - config_path (str): If config_path is specified, both checkpoint model and export model can be used; if config_path=None, the export model is used by default. - """ - - def __init__(self, model_path, config_path=None): - - self.model_path = model_path - - if config_path is not None: - self.cfg = mmcv_config_fromfile(config_path) - else: - logger = get_root_logger() - logger.warning('please use export model!') - if is_url_path(self.model_path) and url_path_exists( - self.model_path): - checkpoint = load_state_dict_from_url(model_path) - else: - assert io.exists( - self.model_path), f'{self.model_path} does not exists' - - with io.open(self.model_path, 'rb') as infile: - checkpoint = torch.load(infile, map_location='cpu') - - assert 'meta' in checkpoint and 'config' in checkpoint[ - 'meta'], 'meta.config is missing from checkpoint' - - config_str = checkpoint['meta']['config'] - if isinstance(config_str, dict): - config_str = json.dumps(config_str) - - # get config - basename = os.path.basename(self.model_path) - fname, _ = os.path.splitext(basename) - self.local_config_file = os.path.join(CACHE_DIR, - f'{fname}_config.json') - if not os.path.exists(CACHE_DIR): - os.makedirs(CACHE_DIR) - with open(self.local_config_file, 'w') as ofile: - ofile.write(config_str) - self.cfg = mmcv_config_fromfile(self.local_config_file) - - # dynamic adapt mmdet models - dynamic_adapt_for_mmlab(self.cfg) - - # build model - self.model = build_model(self.cfg.model) - - self.device = 'cuda' if torch.cuda.is_available() else 'cpu' - map_location = 'cpu' if self.device == 'cpu' else 'cuda' - self.ckpt = load_checkpoint( - self.model, self.model_path, map_location=map_location) - - self.model.to(self.device) - self.model.eval() - - self.CLASSES = self.cfg.CLASSES - - def predict(self, imgs): - """ - Args: - imgs (str/ndarray or list[str/ndarray] or tuple[str/ndarray]): - Either image files or loaded images. - Returns: - If imgs is a list or tuple, the same length list type results - will be returned, otherwise return the detection results directly. 
- """ - - if isinstance(imgs, (list, tuple)): - is_batch = True - else: - imgs = [imgs] - is_batch = False - - cfg = self.cfg - device = next(self.model.parameters()).device # model device - - if isinstance(imgs[0], np.ndarray): - cfg = cfg.copy() - # set loading pipeline type - cfg.data.val.pipeline.insert(0, dict(type='LoadImageFromWebcam')) - else: - cfg = cfg.copy() - # set loading pipeline type - cfg.data.val.pipeline.insert( - 0, - dict( - type='LoadImageFromFile', - file_client_args=dict( - backend=('http' if imgs[0].startswith('http' - ) else 'disk')))) - - cfg.data.val.pipeline = replace_ImageToTensor(cfg.data.val.pipeline) - - transforms = [] - for transform in cfg.data.val.pipeline: - if 'img_scale' in transform: - transform['img_scale'] = tuple(transform['img_scale']) - if isinstance(transform, dict): - transform = build_from_cfg(transform, PIPELINES) - transforms.append(transform) - elif callable(transform): - transforms.append(transform) - else: - raise TypeError('transform must be callable or a dict') - test_pipeline = Compose(transforms) - - datas = [] - for img in imgs: - # prepare data - if isinstance(img, np.ndarray): - # directly add img - data = dict(img=img) - else: - # add information into dict - data = dict(img_info=dict(filename=img), img_prefix=None) - # build the data pipeline - data = test_pipeline(data) - datas.append(data) - - data = collate(datas, samples_per_gpu=len(imgs)) - # just get the actual data from DataContainer - data['img_metas'] = [ - img_metas.data[0] for img_metas in data['img_metas'] - ] - data['img'] = [img.data[0] for img in data['img']] - if next(self.model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [device])[0] - else: - for m in self.model.modules(): - assert not isinstance( - m, RoIPool - ), 'CPU inference with RoIPool is not supported currently.' - - # forward the model - with torch.no_grad(): - results = self.model(mode='test', **data) - - return results - - def visualize(self, - img, - results, - score_thr=0.3, - show=False, - out_file=None): - bboxes = results['detection_boxes'][0] - scores = results['detection_scores'][0] - labels = results['detection_classes'][0].tolist() - - # If self.CLASSES is not None, class_id will be converted to self.CLASSES for visualization, - # otherwise the class_id will be displayed. 
- # And don't try to modify the value in results, it may cause some bugs or even precision problems, - # because `self.evaluate` will also use the results, refer to: https://github.com/alibaba/EasyCV/pull/67 - - if self.CLASSES is not None and len(self.CLASSES) > 0: - for i, classes_id in enumerate(labels): - if classes_id is None: - labels[i] = None - else: - labels[i] = self.CLASSES[int(classes_id)] - - if scores is not None and score_thr > 0: - inds = scores > score_thr - bboxes = bboxes[inds] - labels = np.array(labels)[inds] - + def visualize(self, img, results, show=False, out_file=None): + """Only support show one sample now.""" + bboxes = results['detection_boxes'] + labels = results['detection_class_names'] + img = self._load_input(img)['img'] imshow_bboxes( img, bboxes, labels=labels, - colors='green', - text_color='white', - font_size=20, - thickness=1, - font_scale=0.5, + colors='cyan', + text_color='cyan', + font_size=18, + thickness=2, + font_scale=0.0, show=show, out_file=out_file) +@deprecated(reason='Please use DetectionPredictor.') +@PREDICTORS.register_module() +class DetrPredictor(DetectionPredictor): + """""" + + @PREDICTORS.register_module() class TorchYoloXPredictor(PredictorInterface): diff --git a/easycv/predictors/face_keypoints_predictor.py b/easycv/predictors/face_keypoints_predictor.py index 2c94f0a4..54b13424 100644 --- a/easycv/predictors/face_keypoints_predictor.py +++ b/easycv/predictors/face_keypoints_predictor.py @@ -25,6 +25,11 @@ class FaceKeypointsPredictor(PredictorV2): Args: model_path (str): Path of model path config_file (str): config file path for model and processor to init. Defaults to None. + batch_size (int): batch size for forward. + device (str): Support 'cuda' or 'cpu', if is None, detect device automatically. + save_results (bool): Whether to save predict results. + save_path (str): File path for saving results, only valid when `save_results` is True. + pipelines (list[dict]): Data pipeline configs. 
""" def __init__(self, @@ -34,7 +39,7 @@ class FaceKeypointsPredictor(PredictorV2): device=None, save_results=False, save_path=None, - mode='bgr'): + pipelines=None): super(FaceKeypointsPredictor, self).__init__( model_path, config_file, @@ -42,7 +47,7 @@ class FaceKeypointsPredictor(PredictorV2): device=device, save_results=save_results, save_path=save_path, - mode=mode) + pipelines=pipelines) self.input_size = self.cfg.IMAGE_SIZE self.point_number = self.cfg.POINT_NUMBER diff --git a/easycv/predictors/feature_extractor.py b/easycv/predictors/feature_extractor.py index 79110bb9..fc0802af 100644 --- a/easycv/predictors/feature_extractor.py +++ b/easycv/predictors/feature_extractor.py @@ -6,6 +6,7 @@ import numpy as np import torch from PIL import Image +from easycv.framework.errors import ValueError from .base import Predictor from .builder import PREDICTORS @@ -522,7 +523,7 @@ class TorchFaceAttrExtractor(PredictorInterface): neck_output.device) neck_output = (distribute * neck_output).sum(dim=1) else: - raise Exception( + raise ValueError( 'TorchFaceAttrExtractor for neck %d only support attr_method softmax/distributed sum' % (neck_idx)) neck_output = torch.argmax(neck_output, dim=1) diff --git a/easycv/predictors/hand_keypoints_predictor.py b/easycv/predictors/hand_keypoints_predictor.py index 01d0b0ce..0e092488 100644 --- a/easycv/predictors/hand_keypoints_predictor.py +++ b/easycv/predictors/hand_keypoints_predictor.py @@ -25,9 +25,11 @@ class HandKeypointsPredictor(PredictorV2): config_file: path or ``Config`` of config file detection_model_config: dict of hand detection model predictor config, example like ``dict(type="", model_path="", config_file="", ......)`` - batch_size: batch_size to infer - save_results: bool - save_path: path of result image + batch_size (int): batch size for forward. + device (str): Support 'cuda' or 'cpu', if is None, detect device automatically. + save_results (bool): Whether to save predict results. + save_path (str): File path for saving results, only valid when `save_results` is True. + pipelines (list[dict]): Data pipeline configs. 
""" def __init__(self, @@ -38,7 +40,7 @@ class HandKeypointsPredictor(PredictorV2): device=None, save_results=False, save_path=None, - mode='rgb', + pipelines=None, *args, **kwargs): super(HandKeypointsPredictor, self).__init__( @@ -48,7 +50,7 @@ class HandKeypointsPredictor(PredictorV2): device=device, save_results=save_results, save_path=save_path, - mode=mode, + pipelines=pipelines, *args, **kwargs) self.dataset_info = DatasetInfo(COCO_WHOLEBODY_HAND_DATASET_INFO) @@ -70,52 +72,48 @@ class HandKeypointsPredictor(PredictorV2): } } """ - image_paths = input['inputs'] - batch_data = [] + image_path = input['inputs'] + data_list = [] box_id = 0 - for batch_index, image_path in enumerate(image_paths): - det_bbox_result = input['results']['detection_boxes'][batch_index] - det_bbox_scores = input['results']['detection_scores'][batch_index] - img = mmcv.imread(image_path, 'color', self.mode) - for bbox, score in zip(det_bbox_result, det_bbox_scores): - center, scale = _box2cs(self.cfg.data_cfg['image_size'], bbox) - # prepare data - data = { - 'image_file': - image_path, - 'img': - img, - 'image_id': - batch_index, - 'center': - center, - 'scale': - scale, - 'bbox_score': - score, - 'bbox_id': - box_id, # need to be assigned if batch_size > 1 - 'dataset': - 'coco_wholebody_hand', - 'joints_3d': - np.zeros((self.cfg.data_cfg.num_joints, 3), - dtype=np.float32), - 'joints_3d_visible': - np.zeros((self.cfg.data_cfg.num_joints, 3), - dtype=np.float32), - 'rotation': - 0, - 'flip_pairs': - self.dataset_info.flip_pairs, - 'ann_info': { - 'image_size': - np.array(self.cfg.data_cfg['image_size']), - 'num_joints': self.cfg.data_cfg['num_joints'] - } + det_bbox_result = input['detection_boxes'] + det_bbox_scores = input['detection_scores'] + img = mmcv.imread(image_path, 'color', self.INPUT_IMAGE_MODE) + for bbox, score in zip(det_bbox_result, det_bbox_scores): + center, scale = _box2cs(self.cfg.data_cfg['image_size'], bbox) + # prepare data + data = { + 'image_file': + image_path, + 'img': + img, + 'image_id': + 0, + 'center': + center, + 'scale': + scale, + 'bbox_score': + score, + 'bbox_id': + box_id, # need to be assigned if batch_size > 1 + 'dataset': + 'coco_wholebody_hand', + 'joints_3d': + np.zeros((self.cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'joints_3d_visible': + np.zeros((self.cfg.data_cfg.num_joints, 3), dtype=np.float32), + 'rotation': + 0, + 'flip_pairs': + self.dataset_info.flip_pairs, + 'ann_info': { + 'image_size': np.array(self.cfg.data_cfg['image_size']), + 'num_joints': self.cfg.data_cfg['num_joints'] } - batch_data.append(data) - box_id += 1 - return batch_data + } + data_list.append(data) + box_id += 1 + return data_list def preprocess_single(self, input): results = [] @@ -128,8 +126,11 @@ class HandKeypointsPredictor(PredictorV2): """Process all inputs list. And collate to batch and put to target device. If you need custom ops to load or process a batch samples, you need to reimplement it. 
""" + # hand det and return source image + det_results = self.detection_predictor(inputs, keep_inputs=True) + batch_outputs = [] - for i in inputs: + for i in det_results: for res in self.preprocess_single(i, *args, **kwargs): batch_outputs.append(res) batch_outputs = self._collate_fn(batch_outputs) @@ -137,37 +138,25 @@ class HandKeypointsPredictor(PredictorV2): return batch_outputs def postprocess(self, inputs, *args, **kwargs): - output = {} - output['keypoints'] = inputs['preds'] - output['boxes'] = inputs['boxes'] - for i, bbox in enumerate(output['boxes']): + keypoints = inputs['preds'] + boxes = inputs['boxes'] + for i, bbox in enumerate(boxes): center, scale = bbox[:2], bbox[2:4] - output['boxes'][i][:4] = bbox_cs2xyxy(center, scale) - output['boxes'] = output['boxes'][:, :4] - return output - - def __call__(self, inputs, keep_inputs=False): - if isinstance(inputs, str): - inputs = [inputs] - - results_list = [] - for i in range(0, len(inputs), self.batch_size): - batch = inputs[i:max(len(inputs) - 1, i + self.batch_size)] - # hand det and return source image - det_results = self.detection_predictor(batch, keep_inputs=True) - # hand keypoints - batch_outputs = self.preprocess(det_results) - batch_outputs = self.forward(batch_outputs) - results = self.postprocess(batch_outputs) - if keep_inputs: - results = {'inputs': batch, 'results': results} - # if dump, the outputs will not added to the return value to prevent taking up too much memory - if self.save_results: - self.dump([results], self.save_path, mode='ab+') - else: - results_list.append(results) - - return results_list + boxes[i][:4] = bbox_cs2xyxy(center, scale) + boxes = boxes[:, :4] + # TODO: support multi bboxes for a single sample + assert len(keypoints.shape) == 3 + assert len(boxes.shape) == 2 + batch_outputs = [] + batch_size = keypoints.shape[0] + keypoints = np.split(keypoints, batch_size) + boxes = np.split(boxes, batch_size) + for i in range(batch_size): + batch_outputs.append({ + 'keypoints': keypoints[i], + 'boxes': boxes[i] + }) + return batch_outputs def show_result(self, image_path, diff --git a/easycv/predictors/pose_predictor.py b/easycv/predictors/pose_predictor.py index 34ca5475..b2408051 100644 --- a/easycv/predictors/pose_predictor.py +++ b/easycv/predictors/pose_predictor.py @@ -11,6 +11,7 @@ from torchvision.transforms import Compose from easycv.datasets.pose.data_sources.top_down import DatasetInfo from easycv.datasets.registry import PIPELINES from easycv.file import io +from easycv.framework.errors import ModuleNotFoundError, TypeError, ValueError from easycv.models import build_model from easycv.predictors.builder import PREDICTORS from easycv.predictors.detector import TorchYoloXPredictor diff --git a/easycv/predictors/segmentation.py b/easycv/predictors/segmentation.py index 6916817b..51365653 100644 --- a/easycv/predictors/segmentation.py +++ b/easycv/predictors/segmentation.py @@ -5,22 +5,25 @@ import numpy as np import torch from matplotlib.collections import PatchCollection from matplotlib.patches import Polygon -from torchvision.transforms import Compose from easycv.core.visualization.image import imshow_bboxes -from easycv.datasets.registry import PIPELINES -from easycv.file import io -from easycv.models import build_model from easycv.predictors.builder import PREDICTORS -from easycv.predictors.interface import PredictorInterface -from easycv.utils.checkpoint import load_checkpoint -from easycv.utils.config_tools import mmcv_config_fromfile -from easycv.utils.registry import build_from_cfg 
from .base import PredictorV2


@PREDICTORS.register_module()
class SegmentationPredictor(PredictorV2):
+    """Predictor for Segmentation.
+
+    Args:
+        model_path (str): Path of model file.
+        config_file (Optional[str]): config file path for model and processor to init. Defaults to None.
+        batch_size (int): batch size for forward.
+        device (str): Support 'cuda' or 'cpu'; if None, the device is detected automatically.
+        save_results (bool): Whether to save predict results.
+        save_path (str): File path for saving results, only valid when `save_results` is True.
+        pipelines (list[dict]): Data pipeline configs.
+    """

    def __init__(self,
                 model_path,
@@ -28,20 +31,21 @@ class SegmentationPredictor(PredictorV2):
                 batch_size=1,
                 device=None,
                 save_results=False,
-                 save_path=None):
-        """Predict pipeline for Segmentation
+                 save_path=None,
+                 pipelines=None,
+                 *args,
+                 **kwargs):

-        Args:
-            model_path (str): Path of model path
-            config_file (str): config file path for model and processor to init. Defaults to None.
-        """
        super(SegmentationPredictor, self).__init__(
            model_path,
            config_file,
            batch_size=batch_size,
            device=device,
            save_results=save_results,
-            save_path=save_path)
+            save_path=save_path,
+            pipelines=pipelines,
+            *args,
+            **kwargs)

        self.CLASSES = self.cfg.CLASSES
        self.PALETTE = self.cfg.PALETTE
@@ -123,71 +127,61 @@ class SegmentationPredictor(PredictorV2):


 @PREDICTORS.register_module()
-class Mask2formerPredictor(PredictorInterface):
+class Mask2formerPredictor(SegmentationPredictor):
+    """Predictor for Mask2former.

-    def __init__(self, model_path, model_config=None):
-        """init model
+    Args:
+        model_path (str): Path of model file.
+        config_file (Optional[str]): config file path for model and processor to init. Defaults to None.
+        batch_size (int): batch size for forward.
+        device (str): Support 'cuda' or 'cpu'; if None, the device is detected automatically.
+        save_results (bool): Whether to save predict results.
+        save_path (str): File path for saving results, only valid when `save_results` is True.
+        pipelines (list[dict]): Data pipeline configs.
+    """

-        Args:
-            model_path (str): Path of model path
-            model_config (config, optional): config string for model to init. Defaults to None.
+    def __init__(self,
+                 model_path,
+                 config_file=None,
+                 batch_size=1,
+                 device=None,
+                 save_results=False,
+                 save_path=None,
+                 pipelines=None,
+                 task_mode='panoptic',
+                 *args,
+                 **kwargs):
+        super(Mask2formerPredictor, self).__init__(
+            model_path,
+            config_file,
+            batch_size=batch_size,
+            device=device,
+            save_results=save_results,
+            save_path=save_path,
+            pipelines=pipelines,
+            *args,
+            **kwargs)
+        self.task_mode = task_mode
+
+    def forward(self, inputs):
+        """Model forward.
""" - self.model_path = model_path + with torch.no_grad(): + outputs = self.model(**inputs, mode='test', encode=False) + return outputs - self.device = 'cuda' if torch.cuda.is_available() else 'cpu' - self.model = None - with io.open(self.model_path, 'rb') as infile: - checkpoint = torch.load(infile, map_location='cpu') - - assert 'meta' in checkpoint and 'config' in checkpoint[ - 'meta'], 'meta.config is missing from checkpoint' - - self.cfg = checkpoint['meta']['config'] - self.classes = len(self.cfg.PALETTE) - self.class_name = self.cfg.CLASSES - # build model - self.model = build_model(self.cfg.model) - - self.ckpt = load_checkpoint( - self.model, self.model_path, map_location=self.device) - self.model.to(self.device) - self.model.eval() - - # build pipeline - test_pipeline = self.cfg.test_pipeline - pipeline = [build_from_cfg(p, PIPELINES) for p in test_pipeline] - self.pipeline = Compose(pipeline) - - def predict(self, input_data_list, mode='panoptic'): - """ - Args: - input_data_list: a list of numpy array(in rgb order), each array is a sample - to be predicted - """ - output_list = [] - for idx, img in enumerate(input_data_list): - output = {} - if not isinstance(img, np.ndarray): - img = np.asarray(img) - data_dict = {'img': img} - ori_shape = img.shape - data_dict = self.pipeline(data_dict) - img = data_dict['img'] - img[0] = torch.unsqueeze(img[0], 0).to(self.device) - img_metas = [[ - img_meta._data for img_meta in data_dict['img_metas'] - ]] - img_metas[0][0]['ori_shape'] = ori_shape - res = self.model.forward_test(img, img_metas, encode=False) - if mode == 'panoptic': - output['pan'] = res['pan_results'][0] - elif mode == 'instance': - output['segms'] = res['detection_masks'][0] - output['bboxes'] = res['detection_boxes'][0] - output['scores'] = res['detection_scores'][0] - output['labels'] = res['detection_classes'][0] - output_list.append(output) - return output_list + def postprocess(self, inputs): + output = {} + if self.task_mode == 'panoptic': + output['pan'] = inputs['pan_results'][0] + elif self.task_mode == 'instance': + output['segms'] = inputs['detection_masks'][0] + output['bboxes'] = inputs['detection_boxes'][0] + output['scores'] = inputs['detection_scores'][0] + output['labels'] = inputs['detection_classes'][0] + else: + raise ValueError(f'Not support model {self.task_mode}') + return output def show_panoptic(self, img, pan_mask): pan_label = np.unique(pan_mask) @@ -214,147 +208,6 @@ class Mask2formerPredictor(PredictorInterface): return instance_result -@PREDICTORS.register_module() -class SegFormerPredictor(PredictorInterface): - - def __init__(self, model_path, model_config): - """init model - - Args: - model_path (str): Path of model path - model_config (config): config string for model to init. Defaults to None. 
- """ - self.model_path = model_path - - self.device = 'cuda' if torch.cuda.is_available() else 'cpu' - self.model = None - with io.open(self.model_path, 'rb') as infile: - checkpoint = torch.load(infile, map_location='cpu') - - self.cfg = mmcv_config_fromfile(model_config) - self.CLASSES = self.cfg.CLASSES - self.PALETTE = self.cfg.PALETTE - # build model - self.model = build_model(self.cfg.model) - - self.ckpt = load_checkpoint( - self.model, self.model_path, map_location=self.device) - self.model.to(self.device) - self.model.eval() - - # build pipeline - test_pipeline = self.cfg.test_pipeline - pipeline = [build_from_cfg(p, PIPELINES) for p in test_pipeline] - self.pipeline = Compose(pipeline) - - def predict(self, input_data_list): - """ - using session run predict a number of samples using batch_size - - Args: - input_data_list: a list of numpy array(in rgb order), each array is a sample - to be predicted - use a fixed number if you do not want to adjust batch_size in runtime - """ - output_list = [] - for idx, img in enumerate(input_data_list): - if type(img) is not np.ndarray: - img = np.asarray(img) - - ori_img_shape = img.shape[:2] - - data_dict = {'img': img} - data_dict['ori_shape'] = ori_img_shape - data_dict = self.pipeline(data_dict) - img = data_dict['img'] - img = torch.unsqueeze(img[0], 0).to(self.device) - data_dict.pop('img') - - with torch.no_grad(): - out = self.model([img], - mode='test', - img_metas=[[data_dict['img_metas'][0]._data]]) - - output_list.append(out) - - return output_list - - def show_result(self, - img, - result, - palette=None, - win_name='', - show=False, - wait_time=0, - out_file=None, - opacity=0.5): - """Draw `result` over `img`. - - Args: - img (str or Tensor): The image to be displayed. - result (Tensor): The semantic segmentation results to draw over - `img`. - palette (list[list[int]]] | np.ndarray | None): The palette of - segmentation map. If None is given, random palette will be - generated. Default: None - win_name (str): The window name. - wait_time (int): Value of waitKey param. - Default: 0. - show (bool): Whether to show the image. - Default: False. - out_file (str or None): The filename to write the image. - Default: None. - opacity(float): Opacity of painted segmentation map. - Default 0.5. - Must be in (0, 1] range. - Returns: - img (Tensor): Only if not `show` or `out_file` - """ - - img = mmcv.imread(img) - img = img.copy() - seg = result[0] - if palette is None: - if self.PALETTE is None: - # Get random state before set seed, - # and restore random state later. - # It will prevent loss of randomness, as the palette - # may be different in each iteration if not specified. 
- # See: https://github.com/open-mmlab/mmdetection/issues/5844 - state = np.random.get_state() - np.random.seed(42) - # random palette - palette = np.random.randint( - 0, 255, size=(len(self.CLASSES), 3)) - np.random.set_state(state) - else: - palette = self.PALETTE - palette = np.array(palette) - assert palette.shape[0] == len(self.CLASSES) - assert palette.shape[1] == 3 - assert len(palette.shape) == 2 - assert 0 < opacity <= 1.0 - color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) - for label, color in enumerate(palette): - color_seg[seg == label, :] = color - # convert to BGR - color_seg = color_seg[..., ::-1] - - img = img * (1 - opacity) + color_seg * opacity - img = img.astype(np.uint8) - # if out_file specified, do not show image in window - if out_file is not None: - show = False - - if show: - mmcv.imshow(img, win_name, wait_time) - if out_file is not None: - mmcv.imwrite(img, out_file) - - if not (show or out_file): - return img - - def _get_bias_color(base, max_dist=30): """Get different colors for each masks. diff --git a/easycv/runner/ev_runner.py b/easycv/runner/ev_runner.py index d8b9cb14..7921808b 100644 --- a/easycv/runner/ev_runner.py +++ b/easycv/runner/ev_runner.py @@ -8,6 +8,7 @@ from mmcv.runner import EpochBasedRunner from mmcv.runner.log_buffer import LogBuffer from easycv.file import io +from easycv.framework.errors import RuntimeError, TypeError from easycv.utils.checkpoint import load_checkpoint, save_checkpoint if LooseVersion(torch.__version__) >= LooseVersion('1.6.0'): diff --git a/easycv/toolkit/blade/cv_blade_utils.py b/easycv/toolkit/blade/cv_blade_utils.py index 0bfcb8f4..cd742161 100644 --- a/easycv/toolkit/blade/cv_blade_utils.py +++ b/easycv/toolkit/blade/cv_blade_utils.py @@ -17,6 +17,8 @@ import torch_blade.tensorrt import torchvision from torch_blade import optimize +from easycv.framework.errors import RuntimeError + os.environ['DISC_ENABLE_STITCH'] = os.environ.get('DISC_ENABLE_STITCH', 'true') os.environ['DISC_EXPERIMENTAL_SPECULATION_TLP_ENHANCE'] = os.environ.get( 'DISC_EXPERIMENTAL_SPECULATION_TLP_ENHANCE', 'true') @@ -103,13 +105,13 @@ def opt_trt_config( def cu_prof_start(): ret = _cudart.cudaProfilerStart() if ret != 0: - raise Exception('cudaProfilerStart() returned %d' % ret) + raise RuntimeError('cudaProfilerStart() returned %d' % ret) def cu_prof_stop(): ret = _cudart.cudaProfilerStop() if ret != 0: - raise Exception('cudaProfilerStop() returned %d' % ret) + raise RuntimeError('cudaProfilerStop() returned %d' % ret) @contextmanager diff --git a/easycv/toolkit/prune/prune_utils.py b/easycv/toolkit/prune/prune_utils.py index b9fb2aa2..48f05350 100644 --- a/easycv/toolkit/prune/prune_utils.py +++ b/easycv/toolkit/prune/prune_utils.py @@ -1,4 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
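The builtin-exception replacements in this patch all raise classes from easycv.framework.errors; a minimal sketch of their interface, following the attributes exercised in the new tests/framework/test_errors.py:

from easycv.framework import errors

try:
    # optional `details` (and `op`) are attached to the raised instance,
    # together with `message` and a hex `error_code`
    raise errors.ValueError('value error', details='provide correct value')
except errors.ValueError as e:
    assert e.message == 'value error'
    assert e.details == 'provide correct value'
    assert e.error_code == hex(errors.INVALID_VALUE)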
+from easycv.framework.errors import ValueError + try: from nni.algorithms.compression.pytorch.pruning import AGPPrunerV2 except ImportError: @@ -83,7 +85,7 @@ def load_pruner(model, optimizer=optimizer, pruning_algorithm=pruning_algorithm) else: - raise Exception( + raise ValueError( 'pruning class {} is not supported'.format(pruning_class)) return pruner diff --git a/easycv/toolkit/quantize/quantize_utils.py b/easycv/toolkit/quantize/quantize_utils.py index c7ef8aa8..41ce1ac1 100644 --- a/easycv/toolkit/quantize/quantize_utils.py +++ b/easycv/toolkit/quantize/quantize_utils.py @@ -7,6 +7,7 @@ import numpy as np import torch from mmcv.parallel import scatter_kwargs +from easycv.framework.errors import ValueError from easycv.models.detection.detectors.yolox.yolo_head import YOLOXHead from easycv.models.detection.utils import output_postprocess, postprocess diff --git a/easycv/toolkit/torchacc/convert_ops.py b/easycv/toolkit/torchacc/convert_ops.py index 2e3d69ee..c17f898c 100644 --- a/easycv/toolkit/torchacc/convert_ops.py +++ b/easycv/toolkit/torchacc/convert_ops.py @@ -10,6 +10,8 @@ import torchacc.torch_xla.core.xla_model as xm from prettytable import PrettyTable from torch.distributed import ReduceOp +from easycv.framework.errors import ValueError + DEFAULT_TAG = 'EasyCV-default-barrier-tag' OpSpec = namedtuple('OpSpec', ['module', 'name', 'value']) diff --git a/easycv/utils/checkpoint.py b/easycv/utils/checkpoint.py index 4bf0af60..4c987c83 100644 --- a/easycv/utils/checkpoint.py +++ b/easycv/utils/checkpoint.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import logging import os import torch @@ -8,6 +9,8 @@ from mmcv.runner.checkpoint import get_state_dict, weights_to_cpu from torch.optim import Optimizer from easycv.file import io +from easycv.file.utils import is_url_path +from easycv.framework.errors import TypeError from easycv.utils.constant import CACHE_DIR @@ -31,28 +34,40 @@ def load_checkpoint(model, Returns: dict or OrderedDict: The loaded checkpoint. 
""" - if not filename.startswith('oss://'): - return mmcv_load_checkpoint( - model, - filename, - map_location=map_location, - strict=strict, - logger=logger) - else: + if filename.startswith('oss://'): _, fname = os.path.split(filename) cache_file = os.path.join(CACHE_DIR, fname) + if not os.path.exists(CACHE_DIR): + os.makedirs(CACHE_DIR) if not os.path.exists(cache_file): - print(f'download checkpoint from {filename} to {cache_file}') + logging.info( + f'download checkpoint from {filename} to {cache_file}') io.copy(filename, cache_file) if torch.distributed.is_available( ) and torch.distributed.is_initialized(): torch.distributed.barrier() - return mmcv_load_checkpoint( - model, - cache_file, - map_location=map_location, - strict=strict, - logger=logger) + filename = cache_file + elif is_url_path(filename): + from torch.hub import urlparse, download_url_to_file + parts = urlparse(filename) + base_name = os.path.basename(parts.path) + cache_file = os.path.join(CACHE_DIR, base_name) + if not os.path.exists(CACHE_DIR): + os.makedirs(CACHE_DIR) + if not os.path.exists(cache_file): + logging.info( + f'download checkpoint from {filename} to {cache_file}') + download_url_to_file(filename, cache_file) + if torch.distributed.is_available( + ) and torch.distributed.is_initialized(): + torch.distributed.barrier() + filename = cache_file + return mmcv_load_checkpoint( + model, + filename, + map_location=map_location, + strict=strict, + logger=logger) def save_checkpoint(model, filename, optimizer=None, meta=None): diff --git a/easycv/utils/collect.py b/easycv/utils/collect.py index 887ac417..904a44c9 100644 --- a/easycv/utils/collect.py +++ b/easycv/utils/collect.py @@ -5,6 +5,7 @@ import mmcv import numpy as np import torch +from easycv.framework.errors import ValueError from .gather import gather_tensors_batch diff --git a/easycv/utils/config_tools.py b/easycv/utils/config_tools.py index adc846df..90386c89 100644 --- a/easycv/utils/config_tools.py +++ b/easycv/utils/config_tools.py @@ -7,6 +7,7 @@ from importlib import import_module from mmcv import Config, import_modules_from_strings +from easycv.framework.errors import IOError, KeyError, ValueError from .user_config_params_utils import check_value_type if platform.system() == 'Windows': @@ -58,7 +59,7 @@ def check_base_cfg_path(base_cfg_name='configs/base.py', ori_filename=None): if osp.exists(base_cfg_path_3): return base_cfg_path_3 - raise '%s not Found' % base_cfg_name + raise ValueError('%s not Found' % base_cfg_name) # Read config without __base__ @@ -69,7 +70,7 @@ def mmcv_file2dict_raw(ori_filename): # read configs/config_templates/detection_oss.py filename = check_base_cfg_path(ori_filename) else: - raise '%s and %s not Found' % (ori_filename, filename) + raise ValueError('%s and %s not Found' % (ori_filename, filename)) fileExtname = osp.splitext(filename)[1] if fileExtname not in ['.py', '.json', '.yaml', '.yml']: diff --git a/easycv/utils/constant.py b/easycv/utils/constant.py index 981a8bbb..87afc813 100644 --- a/easycv/utils/constant.py +++ b/easycv/utils/constant.py @@ -1,4 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-CACHE_DIR = '.easycv_cache' +import os + +CACHE_DIR = os.path.expanduser('~/.cache/easycv/') MAX_READ_IMAGE_TRY_TIMES = 10 diff --git a/easycv/utils/json_utils.py b/easycv/utils/json_utils.py index 05dacacb..536966d6 100644 --- a/easycv/utils/json_utils.py +++ b/easycv/utils/json_utils.py @@ -23,6 +23,8 @@ from json import encoder import numpy as np +from easycv.framework.errors import ValueError + # python 3.5 and newer version does not have json.encoder.FLOAT_REPR needs_class_hack = sys.version_info >= (3, 5) diff --git a/easycv/utils/logger.py b/easycv/utils/logger.py index 0c785311..9183af1e 100644 --- a/easycv/utils/logger.py +++ b/easycv/utils/logger.py @@ -3,6 +3,8 @@ import logging from mmcv.utils import get_logger +from easycv.framework.errors import TypeError + def get_root_logger(log_file=None, log_level=logging.INFO): """Get the root logger. diff --git a/easycv/utils/misc.py b/easycv/utils/misc.py index 8e544b96..cce21293 100644 --- a/easycv/utils/misc.py +++ b/easycv/utils/misc.py @@ -1,12 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import functools +import inspect import logging +import warnings from functools import partial import mmcv import numpy as np -from six.moves import map, zip - -from easycv.models.backbones.repvgg_yolox_backbone import RepVGGBlock def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): @@ -79,6 +79,8 @@ def reparameterize_models(model): Args: model: nn.Module """ + from easycv.models.backbones.repvgg_yolox_backbone import RepVGGBlock + reparameterize_count = 0 for layer in model.modules(): if isinstance(layer, RepVGGBlock): @@ -89,3 +91,31 @@ def reparameterize_models(model): .format(reparameterize_count)) print('reparam:', reparameterize_count) return model + + +def deprecated(reason): + """ + This is a decorator which can be used to mark functions + as deprecated. It will result in a warning being emitted + when the function is used. + """ + + def decorator(func1): + if inspect.isclass(func1): + fmt1 = 'Call to deprecated class {name} ({reason}).' + else: + fmt1 = 'Call to deprecated function {name} ({reason}).' + + @functools.wraps(func1) + def new_func1(*args, **kwargs): + warnings.simplefilter('always', DeprecationWarning) + warnings.warn( + fmt1.format(name=func1.__name__, reason=reason), + category=DeprecationWarning, + stacklevel=2) + warnings.simplefilter('default', DeprecationWarning) + return func1(*args, **kwargs) + + return new_func1 + + return decorator diff --git a/easycv/utils/mmlab_utils.py b/easycv/utils/mmlab_utils.py index 17899d08..e4e2df86 100644 --- a/easycv/utils/mmlab_utils.py +++ b/easycv/utils/mmlab_utils.py @@ -11,6 +11,7 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule +from easycv.framework.errors import TypeError, ValueError from easycv.models.registry import BACKBONES, HEADS, MODELS, NECKS from .test_util import run_in_subprocess @@ -159,7 +160,7 @@ class MMAdapter: elif inspect.isclass(module_name): module_obj = module_name else: - raise ValueError( + raise TypeError( 'Only support type `str` and `class` object, but get type {}'. 
format(type(module_name))) return module_obj diff --git a/easycv/utils/registry.py b/easycv/utils/registry.py index b9e19b7d..19d9ab5d 100644 --- a/easycv/utils/registry.py +++ b/easycv/utils/registry.py @@ -4,6 +4,8 @@ from functools import partial import mmcv +from easycv.framework.errors import KeyError, TypeError + class Registry(object): diff --git a/easycv/utils/test_util.py b/easycv/utils/test_util.py index 169b12e8..ddd7245d 100644 --- a/easycv/utils/test_util.py +++ b/easycv/utils/test_util.py @@ -18,6 +18,7 @@ import numpy as np import torch from easycv.file import io +from easycv.framework.errors import RuntimeError TEST_DIR = '/tmp/ev_pytorch_test' diff --git a/easycv/utils/user_config_params_utils.py b/easycv/utils/user_config_params_utils.py index 45d2772d..558741c9 100644 --- a/easycv/utils/user_config_params_utils.py +++ b/easycv/utils/user_config_params_utils.py @@ -1,3 +1,5 @@ +from easycv.framework.errors import TypeError + VALID_TYPES = {tuple, list, str, int, float, bool, type(None)} diff --git a/tests/core/evaluation/test_coco_evaluation.py b/tests/core/evaluation/test_coco_evaluation.py index badf94a9..7ea3a706 100644 --- a/tests/core/evaluation/test_coco_evaluation.py +++ b/tests/core/evaluation/test_coco_evaluation.py @@ -21,6 +21,7 @@ import numpy as np from easycv.core import standard_fields from easycv.core.evaluation import coco_evaluation +from easycv.framework.errors import ValueError class CocoDetectionEvaluationTest(unittest.TestCase): diff --git a/tests/core/optimizer/test_optimizers.py b/tests/core/optimizer/test_optimizers.py index fa569020..3a1c538e 100644 --- a/tests/core/optimizer/test_optimizers.py +++ b/tests/core/optimizer/test_optimizers.py @@ -9,6 +9,8 @@ from torch.autograd import Variable from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau, StepLR from torch.testing._internal.common_utils import TestCase +from easycv.framework.errors import ValueError + @unittest.skipIf( LooseVersion(torch.__version__) < LooseVersion('1.6.0'), diff --git a/tests/datasets/detection/data_sources/test_det_voc_datasource.py b/tests/datasets/detection/data_sources/test_det_voc_datasource.py index 8e016415..cb409c59 100644 --- a/tests/datasets/detection/data_sources/test_det_voc_datasource.py +++ b/tests/datasets/detection/data_sources/test_det_voc_datasource.py @@ -8,6 +8,7 @@ from tests.ut_config import DET_DATA_SMALL_VOC_LOCAL, VOC_CLASSES from easycv.datasets.detection.data_sources.voc import DetSourceVOC from easycv.file import io +from easycv.framework.errors import ValueError class DetSourceVOCTest(unittest.TestCase): @@ -135,7 +136,7 @@ class DetSourceVOCTest(unittest.TestCase): self.assertEqual(num_samples, 20) self.assertEqual(data_source._retry_count, 2) - self.assertEqual(exception.args[0], 'All samples failed to load!') + self.assertEqual(exception.message, 'All samples failed to load!') if __name__ == '__main__': diff --git a/tests/datasets/segmentation/test_seg_raw_dataset.py b/tests/datasets/segmentation/test_seg_raw_dataset.py index 9ad7d489..b9e5a628 100644 --- a/tests/datasets/segmentation/test_seg_raw_dataset.py +++ b/tests/datasets/segmentation/test_seg_raw_dataset.py @@ -7,9 +7,7 @@ from tests.ut_config import (IMG_NORM_CFG_255, SEG_DATA_SMALL_RAW_LOCAL, from easycv.core.evaluation.builder import build_evaluator from easycv.datasets.builder import build_datasource -from easycv.datasets.segmentation.data_sources.raw import SegSourceRaw from easycv.datasets.segmentation.raw import SegDataset -from easycv.file import io class 
SegDatasetTest(unittest.TestCase): diff --git a/tests/file/test_file_io.py b/tests/file/test_file_io.py index e67f75ed..31929f0c 100644 --- a/tests/file/test_file_io.py +++ b/tests/file/test_file_io.py @@ -7,7 +7,8 @@ import unittest import uuid from tests.ut_config import (BASE_LOCAL_PATH, CLS_DATA_NPY_LOCAL, - CLS_DATA_NPY_OSS, IO_DATA_TXTX_OSS, TMP_DIR_OSS) + CLS_DATA_NPY_OSS, IO_DATA_MULTI_DIRS_OSS, + IO_DATA_TXTX_OSS, TMP_DIR_OSS) from easycv.file import io @@ -128,6 +129,28 @@ class IOForOSSTest(unittest.TestCase): io.remove(temp_dir) io.remove(oss_file_path2) + def test_copytree_multi_dirs(self): + target = [ + 'dir1/a.txt', 'dir1/dir1_1/a.txt', 'dir1/dir1_1/b.txt', + 'dir2/b.txt' + ] + # test copy dir from oss to local + oss_file_path1 = IO_DATA_MULTI_DIRS_OSS + temp_dir = tempfile.TemporaryDirectory().name + io.copytree(oss_file_path1, temp_dir) + self.assertTrue(io.exists(temp_dir)) + self.assertCountEqual(io.listdir(temp_dir, recursive=True), target) + + # test copy dir from local to oss + oss_file_path2 = os.path.join(TMP_DIR_OSS, '%s' % uuid.uuid4().hex) + io.copytree(temp_dir, oss_file_path2) + self.assertTrue(io.exists(oss_file_path2)) + self.assertCountEqual( + io.listdir(oss_file_path2, recursive=True), target) + + io.remove(temp_dir) + io.remove(oss_file_path2) + def test_listdir(self): # with suffix / files = io.listdir(IO_DATA_TXTX_OSS.rstrip('/') + '/') diff --git a/tests/framework/__init__.py b/tests/framework/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/framework/test_errors.py b/tests/framework/test_errors.py new file mode 100644 index 00000000..a01d3290 --- /dev/null +++ b/tests/framework/test_errors.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + + +class ErrorsTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + + def test_errors(self): + from easycv.framework import errors + + def dummy_op(): + pass + + with self.assertRaises(errors.ValueError) as cm: + raise errors.ValueError( + 'value error', details='provide correct value', op=dummy_op) + value_exception = cm.exception + self.assertEqual(value_exception.error_code, hex(errors.INVALID_VALUE)) + self.assertEqual(value_exception.op, dummy_op) + self.assertEqual(value_exception.details, 'provide correct value') + self.assertEqual(value_exception.message, 'value error') + + with self.assertRaises(errors.NotImplementedError) as cm: + raise errors.NotImplementedError() + value_exception = cm.exception + self.assertEqual(value_exception.error_code, hex(errors.UNIMPLEMENTED)) + self.assertEqual(value_exception.op, None) + self.assertEqual(value_exception.details, None) + self.assertEqual(value_exception.message, '') + + with self.assertRaises(errors.FileNotFoundError) as cm: + raise errors.FileNotFoundError + value_exception = cm.exception + self.assertEqual(value_exception.error_code, + hex(errors.FILE_NOT_FOUND)) + self.assertEqual(value_exception.op, None) + self.assertEqual(value_exception.details, None) + self.assertEqual(value_exception.message, '') + + with self.assertRaises(errors.TimeoutError) as cm: + raise errors.TimeoutError('time out') + value_exception = cm.exception + self.assertEqual(value_exception.error_code, hex(errors.TIMEOUT)) + self.assertEqual(value_exception.op, None) + self.assertEqual(value_exception.details, None) + self.assertEqual(value_exception.message, 'time out') + + +if __name__ == '__main__': + unittest.main() diff --git 
a/tests/models/backbones/test_deitiii.py b/tests/models/backbones/test_deitiii.py new file mode 100644 index 00000000..f95f6ea5 --- /dev/null +++ b/tests/models/backbones/test_deitiii.py @@ -0,0 +1,42 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import numpy as np +import torch +from numpy.testing import assert_array_almost_equal + + +class DeiTIIITest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + + @unittest.skip('skip DeiT III unittest') + def test_deitiii(self): + model_path = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/classification/deitiii/epoch_800.pth' + config_path = 'configs/classification/imagenet/vit/imagenet_deitiii_large_patch16_192_jpg.py' + img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/deitiii_demo.JPEG' + # deitiii = ClsPredictor(model_path, config_path) + deitiii = [] + output = deitiii.predict(img) + + self.assertIn('prob', output) + self.assertIn('class', output) + self.assertEqual(len(output['prob'][0]), 1000) + + assert_array_almost_equal( + output['prob'][0][:10], + torch.Tensor([ + 2.04629918698628899e-06, 5.27398606209317222e-06, + 5.52915162188583054e-06, 3.60625563189387321e-06, + 3.29447357216849923e-06, 5.61309570912271738e-06, + 8.93703327164985240e-06, 4.89157764604897238e-06, + 4.39371024185675196e-06, 5.21611764270346612e-06 + ]), + decimal=8) + + self.assertEqual(int(output['class']), 948) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/models/backbones/test_vitdet.py b/tests/models/backbones/test_vitdet.py index 3f0350a2..82012aed 100644 --- a/tests/models/backbones/test_vitdet.py +++ b/tests/models/backbones/test_vitdet.py @@ -14,18 +14,27 @@ class ViTDetTest(unittest.TestCase): def test_vitdet(self): model = ViTDet( img_size=1024, + patch_size=16, embed_dim=768, depth=12, num_heads=12, + drop_path_rate=0.1, + window_size=14, mlp_ratio=4, qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.1, - use_abs_pos_emb=True, - aggregation='attn', - ) + window_block_indexes=[ + # 2, 5, 8 11 for global attention + 0, + 1, + 3, + 4, + 6, + 7, + 9, + 10, + ], + residual_block_indexes=[], + use_rel_pos=True) model.init_weights() model.train() diff --git a/tests/predictors/test_classifier.py b/tests/predictors/test_classifier.py index 8aef4778..c546dfe6 100644 --- a/tests/predictors/test_classifier.py +++ b/tests/predictors/test_classifier.py @@ -8,14 +8,57 @@ import unittest import cv2 import torch - -from easycv.predictors.classifier import TorchClassifier - +from easycv.predictors.builder import build_predictor from easycv.utils.test_util import clean_up, get_tmp_dir +from easycv.utils.config_tools import mmcv_config_fromfile from tests.ut_config import (PRETRAINED_MODEL_RESNET50_WITHOUTHEAD, IMAGENET_LABEL_TXT, TEST_IMAGES_DIR) +class ClassificationPredictorTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + + def test_single(self): + checkpoint = PRETRAINED_MODEL_RESNET50_WITHOUTHEAD + config_file = 'configs/classification/imagenet/resnet/imagenet_resnet50_jpg.py' + cfg = mmcv_config_fromfile(config_file) + predict_op = build_predictor( + dict( + **cfg.predict, + model_path=checkpoint, + config_file=config_file, + label_map_path=IMAGENET_LABEL_TXT)) + img_path = os.path.join(TEST_IMAGES_DIR, 'catb.jpg') + + results = predict_op([img_path])[0] + self.assertListEqual(results['class'], 
[283]) + self.assertListEqual(results['class_name'], ['"Persian cat",']) + self.assertEqual(len(results['class_probs']), 1000) + + def test_batch(self): + checkpoint = PRETRAINED_MODEL_RESNET50_WITHOUTHEAD + config_file = 'configs/classification/imagenet/resnet/imagenet_resnet50_jpg.py' + cfg = mmcv_config_fromfile(config_file) + predict_op = build_predictor( + dict( + **cfg.predict, + model_path=checkpoint, + config_file=config_file, + label_map_path=IMAGENET_LABEL_TXT, + batch_size=3)) + img_path = os.path.join(TEST_IMAGES_DIR, 'catb.jpg') + + num_imgs = 4 + results = predict_op([img_path] * num_imgs) + self.assertEqual(len(results), num_imgs) + for res in results: + self.assertListEqual(res['class'], [283]) + self.assertListEqual(res['class_name'], ['"Persian cat",']) + self.assertEqual(len(res['class_probs']), 1000) + + class TorchClassifierTest(unittest.TestCase): def setUp(self): @@ -62,6 +105,8 @@ class TorchClassifierTest(unittest.TestCase): output_ckpt = f'{self.tmp_dir}/export.pth' torch.save(output_dict, output_ckpt) + from easycv.predictors.classifier import TorchClassifier + fe = TorchClassifier( output_ckpt, topk=topk, label_map_path=IMAGENET_LABEL_TXT) diff --git a/tests/predictors/test_detector.py b/tests/predictors/test_detector.py index 9187d3a7..1b160a01 100644 --- a/tests/predictors/test_detector.py +++ b/tests/predictors/test_detector.py @@ -4,11 +4,11 @@ isort:skip_file """ import os import unittest - +import tempfile import numpy as np from PIL import Image -from easycv.predictors.detector import TorchYoloXPredictor, DetrPredictor +from easycv.predictors.detector import TorchYoloXPredictor, DetectionPredictor from tests.ut_config import (PRETRAINED_MODEL_YOLOXS_EXPORT, PRETRAINED_MODEL_YOLOXS_EXPORT_OLD, PRETRAINED_MODEL_YOLOXS_NOPRE_NOTRT_JIT, @@ -154,78 +154,204 @@ class DetectorTest(unittest.TestCase): [510.37033, 268.4982, 527.67017, 273.04935]]), decimal=1) - def test_vitdet_detector(self): - model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn_export.pth' - img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/demo.jpg' - out_file = './result.jpg' - vitdet = DetrPredictor(model_path) - output = vitdet.predict(img) - vitdet.visualize(img, output, out_file=out_file) - + def _detection_detector_assert(self, output): self.assertIn('detection_boxes', output) self.assertIn('detection_scores', output) self.assertIn('detection_classes', output) self.assertIn('detection_masks', output) self.assertIn('img_metas', output) - self.assertEqual(len(output['detection_boxes'][0]), 30) - self.assertEqual(len(output['detection_scores'][0]), 30) - self.assertEqual(len(output['detection_classes'][0]), 30) + self.assertEqual(len(output['detection_boxes']), 33) + self.assertEqual(len(output['detection_scores']), 33) + self.assertEqual(len(output['detection_classes']), 33) self.assertListEqual( - output['detection_classes'][0].tolist(), + output['detection_classes'].tolist(), np.array([ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 7, 7, 13, 13, 13, 56 + 2, 2, 2, 2, 2, 2, 7, 7, 13, 13, 13, 56 ], dtype=np.int32).tolist()) assert_array_almost_equal( - output['detection_scores'][0], + output['detection_scores'], np.array([ - 0.99791867, 0.99665856, 0.99480623, 0.99060905, 0.9882515, - 0.98319584, 0.9738879, 0.97290784, 0.9514897, 0.95104814, - 0.9321701, 0.86165, 0.8228847, 0.7623552, 0.76129806, - 0.6050861, 0.44348577, 0.3452973, 0.2895671, 0.22109479, - 
0.21265312, 0.17855245, 0.1205352, 0.08981906, 0.10596471, - 0.05854294, 0.99749386, 0.9472857, 0.5945908, 0.09855112 + 0.9975854158401489, 0.9965696334838867, 0.9922919869422913, + 0.9833580851554871, 0.983080267906189, 0.970454752445221, + 0.9701289534568787, 0.9649872183799744, 0.9642795324325562, + 0.9642238020896912, 0.9529680609703064, 0.9403366446495056, + 0.9391788244247437, 0.8941807150840759, 0.8178097009658813, + 0.8013413548469543, 0.6677654385566711, 0.3952914774417877, + 0.33463895320892334, 0.32501447200775146, 0.27323535084724426, + 0.20197080075740814, 0.15607696771621704, 0.1068163588643074, + 0.10183875262737274, 0.09735643863677979, 0.06559795141220093, + 0.08890066295862198, 0.076363705098629, 0.9954648613929749, + 0.9212945699691772, 0.5224372148513794, 0.20555885136127472 ], dtype=np.float32), decimal=2) assert_array_almost_equal( - output['detection_boxes'][0], - np.array([[294.7058, 117.29371, 378.83713, 149.99928], - [609.05444, 112.526474, 633.2971, 136.35175], - [481.4165, 110.987335, 522.5531, 130.01529], - [167.68184, 109.89049, 215.49057, 139.86987], - [374.75082, 110.68697, 433.10028, 136.23654], - [189.54971, 110.09322, 297.6167, 155.77412], - [266.5185, 105.37718, 326.54385, 127.916374], - [556.30225, 110.43166, 592.8248, 128.03764], - [432.49252, 105.086464, 484.0512, 132.272], - [0., 110.566444, 62.01249, 146.44017], - [591.74664, 110.43527, 619.73816, 126.68549], - [99.126854, 90.947975, 118.46699, 101.11096], - [59.895264, 94.110054, 85.60521, 106.67633], - [142.95819, 96.61966, 165.96964, 104.95929], - [83.062515, 89.802605, 99.1546, 98.69074], - [226.28802, 98.32568, 249.06772, 108.86408], - [136.67789, 94.75706, 154.62924, 104.289536], - [170.42459, 98.458694, 183.16309, 106.203156], - [67.56731, 89.68286, 82.62955, 98.35645], - [222.80092, 97.828445, 239.02655, 108.29377], - [134.34427, 92.31653, 149.19615, 102.97457], - [613.5186, 102.27066, 636.0434, 112.813644], - [607.4787, 110.87984, 630.1123, 127.65646], - [135.13664, 90.989876, 155.67192, 100.18036], - [431.61505, 105.43844, 484.36508, 132.50078], - [189.92722, 110.38832, 297.74353, 155.95557], - [220.67035, 177.13489, 455.32092, 380.45712], - [372.76584, 134.33807, 432.44357, 188.51534], - [50.403812, 110.543495, 70.4368, 119.65186], - [373.50272, 134.27258, 432.18475, 187.81824]]), + output['detection_boxes'], + np.array([[ + 294.22674560546875, 116.6078109741211, 379.4328918457031, + 150.14097595214844 + ], + [ + 482.6017761230469, 110.75955963134766, + 522.8798828125, 129.71286010742188 + ], + [ + 167.06460571289062, 109.95974731445312, + 212.83975219726562, 140.16102600097656 + ], + [ + 609.2930908203125, 113.13909149169922, + 637.3115844726562, 136.4690704345703 + ], + [ + 191.185791015625, 111.1408920288086, 301.31689453125, + 155.7731170654297 + ], + [ + 431.2244873046875, 106.19962310791016, + 483.860595703125, 132.21627807617188 + ], + [ + 267.48358154296875, 105.5920639038086, + 325.2832336425781, 127.11176300048828 + ], + [ + 591.2138671875, 110.29329681396484, + 619.8524169921875, 126.1990966796875 + ], + [ + 0.0, 110.7026596069336, 61.487945556640625, + 146.33018493652344 + ], + [ + 555.9155883789062, 110.03486633300781, + 591.7050170898438, 127.06097412109375 + ], + [ + 60.24559783935547, 94.12760162353516, + 85.63741302490234, 106.66705322265625 + ], + [ + 99.02665710449219, 90.53657531738281, + 118.83953094482422, 101.18717956542969 + ], + [ + 396.30438232421875, 111.59194946289062, + 431.559814453125, 133.96914672851562 + ], + [ + 83.81543731689453, 89.65665435791016, + 
99.9166259765625, 98.25627899169922 + ], + [ + 139.29647827148438, 96.68000793457031, + 165.22410583496094, 105.60000610351562 + ], + [ + 67.27152252197266, 89.42798614501953, + 83.25617980957031, 98.0460205078125 + ], + [ + 223.74176025390625, 98.68321990966797, + 250.42506408691406, 109.32588958740234 + ], + [ + 136.7582244873047, 96.51412963867188, + 152.51190185546875, 104.73160552978516 + ], + [ + 221.71812438964844, 97.86445617675781, + 238.9705810546875, 106.96803283691406 + ], + [ + 135.06964111328125, 91.80916595458984, 155.24609375, + 102.20686340332031 + ], + [ + 169.11180114746094, 97.53628540039062, + 182.88504028320312, 105.95404815673828 + ], + [ + 133.8811798095703, 91.00375366210938, + 145.35507202148438, 102.3780288696289 + ], + [ + 614.2507934570312, 102.19828796386719, + 636.5692749023438, 112.59198760986328 + ], + [ + 35.94759750366211, 91.7213363647461, + 70.38274383544922, 117.19855499267578 + ], + [ + 554.6401977539062, 115.18976593017578, + 562.0255737304688, 127.4429931640625 + ], + [ + 39.07550811767578, 92.73261260986328, + 85.36636352539062, 106.73953247070312 + ], + [ + 200.85513305664062, 93.00469970703125, + 219.73086547851562, 107.99642181396484 + ], + [ + 0.0, 111.18904876708984, 61.7393684387207, + 146.72547912597656 + ], + [ + 191.88568115234375, 111.09577178955078, + 299.4097900390625, 155.14639282226562 + ], + [ + 221.06834411621094, 176.6427001953125, + 458.3475341796875, 378.89300537109375 + ], + [ + 372.7131652832031, 135.51429748535156, + 433.2494201660156, 188.0106658935547 + ], + [ + 52.19819641113281, 110.3646011352539, + 70.95110321044922, 120.10567474365234 + ], + [ + 376.1671447753906, 133.6930694580078, + 432.2721862792969, 187.99481201171875 + ]]), decimal=1) + def test_detection_detector_single(self): + model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100_export.pth' + img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/demo.jpg' + vitdet = DetectionPredictor(model_path, score_threshold=0.0) + output = vitdet(img) + output = output[0] + with tempfile.NamedTemporaryFile(suffix='.jpg') as tmp_file: + tmp_save_path = tmp_file.name + vitdet.visualize(img, output, out_file=tmp_save_path) + self._detection_detector_assert(output) + + def test_detection_detector_batch(self): + model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100_export.pth' + img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/demo.jpg' + vitdet = DetectionPredictor( + model_path, score_threshold=0.0, batch_size=2) + num_samples = 3 + images = [img] * num_samples + outputs = vitdet(images) + self.assertEqual(len(outputs), num_samples) + for output in outputs: + with tempfile.NamedTemporaryFile(suffix='.jpg') as tmp_file: + tmp_save_path = tmp_file.name + vitdet.visualize(img, output, out_file=tmp_save_path) + self._detection_detector_assert(output) + if __name__ == '__main__': unittest.main() diff --git a/tests/predictors/test_detector_blade.py b/tests/predictors/test_detector_blade.py index 3f3aae65..143425a3 100644 --- a/tests/predictors/test_detector_blade.py +++ b/tests/predictors/test_detector_blade.py @@ -3,22 +3,14 @@ isort:skip_file """ import os -import tempfile import unittest -import cv2 import numpy as np from PIL import Image from easycv.predictors.detector import TorchYoloXPredictor -from tests.ut_config import (PRETRAINED_MODEL_YOLOXS_EXPORT, - 
PRETRAINED_MODEL_YOLOXS_NOPRE_NOTRT_JIT, - PRETRAINED_MODEL_YOLOXS_PRE_NOTRT_JIT, - PRETRAINED_MODEL_YOLOXS_NOPRE_NOTRT_BLADE, +from tests.ut_config import (PRETRAINED_MODEL_YOLOXS_NOPRE_NOTRT_BLADE, PRETRAINED_MODEL_YOLOXS_PRE_NOTRT_BLADE, DET_DATA_SMALL_COCO_LOCAL) -from easycv.utils.test_util import benchmark -import logging -import pandas as pd import torch from numpy.testing import assert_array_almost_equal @@ -37,7 +29,6 @@ class DetectorTest(unittest.TestCase): input_data_list = [np.asarray(Image.open(img))] blade_path = PRETRAINED_MODEL_YOLOXS_NOPRE_NOTRT_BLADE - # blade_path = '/home/zouxinyi.zxy/easycv_nfs/pretrained_models/detection/infer_yolox/debug_blade.pt.blade' predictor_blade = TorchYoloXPredictor( model_path=blade_path, score_thresh=0.5) diff --git a/tests/predictors/test_face_keypoints_predictor.py b/tests/predictors/test_face_keypoints_predictor.py index 67482e51..3f62319a 100644 --- a/tests/predictors/test_face_keypoints_predictor.py +++ b/tests/predictors/test_face_keypoints_predictor.py @@ -19,7 +19,7 @@ class FaceKeypointsPredictorWithoutDetectorTest(unittest.TestCase): def test_single(self): predict_pipeline = FaceKeypointsPredictor( model_path=self.model_path, config_file=self.model_config_path) - output = predict_pipeline(self.image_path)[0][0] + output = predict_pipeline(self.image_path)[0] output_keypoints = output['point'] output_pose = output['pose'] img = cv2.imread(self.image_path) @@ -38,18 +38,10 @@ class FaceKeypointsPredictorWithoutDetectorTest(unittest.TestCase): total_samples = 3 output = predict_pipeline([self.image_path] * total_samples) - self.assertEqual(len(output), 2) - self.assertEqual(len(output[0]), 2) - self.assertEqual(len(output[1]), 1) - self.assertEqual(output[0][0]['point'].shape[0], 106) - self.assertEqual(output[0][0]['point'].shape[1], 2) - self.assertEqual(output[0][0]['pose'].shape[0], 3) - self.assertEqual(output[0][1]['point'].shape[0], 106) - self.assertEqual(output[0][1]['point'].shape[1], 2) - self.assertEqual(output[0][1]['pose'].shape[0], 3) - self.assertEqual(output[1][0]['point'].shape[0], 106) - self.assertEqual(output[1][0]['point'].shape[1], 2) - self.assertEqual(output[1][0]['pose'].shape[0], 3) + self.assertEqual(len(output), total_samples) + for out in output: + self.assertEqual(out['point'].shape, (106, 2)) + self.assertEqual(out['pose'].shape, (3, )) if __name__ == '__main__': diff --git a/tests/predictors/test_hand_keypoints_predictor.py b/tests/predictors/test_hand_keypoints_predictor.py index b2bca4cf..4a325098 100644 --- a/tests/predictors/test_hand_keypoints_predictor.py +++ b/tests/predictors/test_hand_keypoints_predictor.py @@ -39,6 +39,37 @@ class HandKeypointsPredictorTest(unittest.TestCase): self.assertEqual(keypoints.shape[1], 21) self.assertEqual(keypoints.shape[2], 3) + def test_batch(self): + config = mmcv_config_fromfile(self.model_config_path) + predict_pipeline = HandKeypointsPredictor( + model_path=self.model_path, + config_file=config, + batch_size=2, + detection_predictor_config=dict( + type='DetectionPredictor', + model_path=MM_DEFAULT_HAND_DETECTION_SSDLITE_MODEL_PATH, + config_file=MM_DEFAULT_HAND_DETECTION_SSDLITE_CONFIG_FILE, + score_threshold=0.5)) + + num_samples = 4 + outputs = predict_pipeline( + [self.image_path] * num_samples, keep_inputs=True) + base_keypoints = outputs[0]['keypoints'] + base_boxes = outputs[0]['boxes'] + for output in outputs: + keypoints = output['keypoints'] + boxes = output['boxes'] + image_show = predict_pipeline.show_result( + self.image_path, + keypoints, + 
boxes, + save_path=self.save_image_path) + self.assertEqual(keypoints.shape, (1, 21, 3)) + self.assertEqual(boxes.shape, (1, 4)) + self.assertListEqual(keypoints.tolist(), base_keypoints.tolist()) + self.assertListEqual(boxes.tolist(), base_boxes.tolist()) + self.assertEqual(output['inputs'], self.image_path) + if __name__ == '__main__': unittest.main() diff --git a/tests/predictors/test_segmentation.py b/tests/predictors/test_segmentation.py index e84a3e1a..5b36a2fb 100644 --- a/tests/predictors/test_segmentation.py +++ b/tests/predictors/test_segmentation.py @@ -8,6 +8,7 @@ import unittest import numpy as np from PIL import Image from tests.ut_config import (MODEL_CONFIG_SEGFORMER, + PRETRAINED_MODEL_MASK2FORMER_DIR, PRETRAINED_MODEL_SEGFORMER, TEST_IMAGES_DIR) from easycv.predictors.segmentation import SegmentationPredictor @@ -31,14 +32,14 @@ class SegmentationPredictorTest(unittest.TestCase): outputs = predict_pipeline(img_path, keep_inputs=True) self.assertEqual(len(outputs), 1) - self.assertEqual(outputs[0]['inputs'], [img_path]) + results = outputs[0] + self.assertEqual(results['inputs'], img_path) - results = outputs[0]['results'] self.assertListEqual( - list(img.shape)[:2], list(results['seg_pred'][0].shape)) - self.assertListEqual(results['seg_pred'][0][1, :10].tolist(), + list(img.shape)[:2], list(results['seg_pred'].shape)) + self.assertListEqual(results['seg_pred'][1, :10].tolist(), [161 for i in range(10)]) - self.assertListEqual(results['seg_pred'][0][-1, -10:].tolist(), + self.assertListEqual(results['seg_pred'][-1, -10:].tolist(), [133 for i in range(10)]) def test_batch(self): @@ -56,19 +57,15 @@ class SegmentationPredictorTest(unittest.TestCase): total_samples = 3 outputs = predict_pipeline( [img_path] * total_samples, keep_inputs=True) - self.assertEqual(len(outputs), 2) + self.assertEqual(len(outputs), 3) - self.assertEqual(outputs[0]['inputs'], [img_path] * 2) - self.assertEqual(outputs[1]['inputs'], [img_path] * 1) - self.assertEqual(len(outputs[0]['results']['seg_pred']), 2) - self.assertEqual(len(outputs[1]['results']['seg_pred']), 1) - - for result in [outputs[0]['results'], outputs[1]['results']]: + for i in range(len(outputs)): + self.assertEqual(outputs[i]['inputs'], img_path) self.assertListEqual( - list(img.shape)[:2], list(result['seg_pred'][0].shape)) - self.assertListEqual(result['seg_pred'][0][1, :10].tolist(), + list(img.shape)[:2], list(outputs[i]['seg_pred'].shape)) + self.assertListEqual(outputs[i]['seg_pred'][1, :10].tolist(), [161 for i in range(10)]) - self.assertListEqual(result['seg_pred'][0][-1, -10:].tolist(), + self.assertListEqual(outputs[i]['seg_pred'][-1, -10:].tolist(), [133 for i in range(10)]) def test_dump(self): @@ -91,17 +88,47 @@ class SegmentationPredictorTest(unittest.TestCase): total_samples = 3 outputs = predict_pipeline( - [img_path] * total_samples, keep_inputs=True) + [img_path] * total_samples, keep_inputs=False) self.assertEqual(outputs, []) with open(tmp_path, 'rb') as f: results = pickle.loads(f.read()) - self.assertIn('inputs', results[0]) - self.assertIn('results', results[0]) + for res in results: + self.assertNotIn('inputs', res) + self.assertIn('seg_pred', res) shutil.rmtree(temp_dir, ignore_errors=True) +@unittest.skipIf(True, 'WIP') +class Mask2formerPredictorTest(unittest.TestCase): + + def test_single(self): + import cv2 + from easycv.predictors.segmentation import Mask2formerPredictor + pan_ckpt = os.path.join(PRETRAINED_MODEL_MASK2FORMER_DIR, + 'mask2former_pan_export.pth') + instance_ckpt = 
os.path.join(PRETRAINED_MODEL_MASK2FORMER_DIR, + 'mask2former_r50_instance.pth') + img_path = os.path.join(TEST_IMAGES_DIR, 'mask2former.jpg') + + # panop + predictor = Mask2formerPredictor( + model_path=pan_ckpt, output_mode='panoptic') + img = cv2.imread(img_path) + predict_out = predictor([img]) + pan_img = predictor.show_panoptic(img, predict_out[0]['pan']) + cv2.imwrite('pan_out.jpg', pan_img) + + # instance + predictor = Mask2formerPredictor( + model_path=instance_ckpt, output_mode='instance') + img = cv2.imread(img_path) + predict_out = predictor.predict([img], mode='instance') + instance_img = predictor.show_instance(img, **predict_out[0]) + cv2.imwrite('instance_out.jpg', instance_img) + + if __name__ == '__main__': unittest.main() diff --git a/tests/predictors/test_segmentor.py b/tests/predictors/test_segmentor.py deleted file mode 100644 index 1ca3eece..00000000 --- a/tests/predictors/test_segmentor.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -""" -isort:skip_file -""" -import os -import unittest - -import numpy as np -from PIL import Image - -from tests.ut_config import TEST_IMAGES_DIR -from tests.ut_config import (PRETRAINED_MODEL_SEGFORMER, - MODEL_CONFIG_SEGFORMER) -from easycv.predictors.segmentation import SegFormerPredictor - - -class SegmentorTest(unittest.TestCase): - - def setUp(self): - print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) - - def test_segformer_detector(self): - segmentation_model_path = PRETRAINED_MODEL_SEGFORMER - segmentation_model_config = MODEL_CONFIG_SEGFORMER - - img = os.path.join(TEST_IMAGES_DIR, '000000289059.jpg') - if not os.path.exists(img): - img = './data/test/segmentation/coco_stuff_164k/val2017/000000289059.jpg' - - input_data_list = [np.asarray(Image.open(img))] - predictor = SegFormerPredictor( - model_path=segmentation_model_path, - model_config=segmentation_model_config) - - output = predictor.predict(input_data_list)[0] - self.assertIn('seg_pred', output) - - self.assertListEqual( - list(input_data_list[0].shape)[:2], - list(output['seg_pred'][0].shape)) - self.assertListEqual(output['seg_pred'][0][1, :10].tolist(), - [161 for i in range(10)]) - self.assertListEqual(output['seg_pred'][0][-1, -10:].tolist(), - [133 for i in range(10)]) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/ut_config.py b/tests/ut_config.py index 7a324a69..42c8620b 100644 --- a/tests/ut_config.py +++ b/tests/ut_config.py @@ -45,7 +45,8 @@ SMALL_IMAGENET_TFRECORD_OSS = os.path.join( BASE_OSS_PATH, 'data/classification/small_imagenet_tfrecord/') IO_DATA_TXTX_OSS = os.path.join(BASE_OSS_PATH, 'data/io_test_dir/txts/') - +IO_DATA_MULTI_DIRS_OSS = os.path.join(BASE_OSS_PATH, + 'data/io_test_dir/multi_dirs/') DET_DATA_SMALL_COCO_LOCAL = os.path.join(BASE_LOCAL_PATH, 'data/detection/small_coco') @@ -119,10 +120,10 @@ PRETRAINED_MODEL_YOLOX_COMPRESSION = os.path.join( BASE_LOCAL_PATH, 'pretrained_models/compression/yolox_compression.pth') PRETRAINED_MODEL_MAE = os.path.join( BASE_LOCAL_PATH, 'pretrained_models/classification/vit/mae_vit_b_1600.pth') -PRETRAINED_MODEL_MASK2FORMER = os.path.join( - BASE_LOCAL_PATH, - 'pretrained_models/segmentation/mask2former/mask2former_r50_instance.pth') - +PRETRAINED_MODEL_MASK2FORMER_DIR = os.path.join( + BASE_LOCAL_PATH, 'pretrained_models/segmentation/mask2former/') +PRETRAINED_MODEL_MASK2FORMER = os.path.join(PRETRAINED_MODEL_MASK2FORMER_DIR, + 'mask2former_r50_instance.pth') PRETRAINED_MODEL_SEGFORMER = os.path.join( BASE_LOCAL_PATH, 
'pretrained_models/segmentation/segformer/segformer_b0/SegmentationEvaluator_mIoU_best.pth' diff --git a/tests/utils/test_json_utils.py b/tests/utils/test_json_utils.py index f3d60696..7c906ae6 100644 --- a/tests/utils/test_json_utils.py +++ b/tests/utils/test_json_utils.py @@ -21,6 +21,7 @@ import tempfile import unittest from easycv.file import io +from easycv.framework.errors import ValueError from easycv.utils import json_utils diff --git a/thirdparty/u2sod/sodpredictor.py b/thirdparty/u2sod/sodpredictor.py index a336c1ac..ff981379 100644 --- a/thirdparty/u2sod/sodpredictor.py +++ b/thirdparty/u2sod/sodpredictor.py @@ -21,6 +21,7 @@ except: from easycv.predictors.builder import build_predictor, PREDICTORS +from easycv.utils.constant import CACHE_DIR def normPRED(d): @@ -47,8 +48,8 @@ class SODPredictor(object): """ def load_url_weights(name, url_index="http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/evtorch_thirdparty/u2net_sod/", map_location=None): - os.makedirs('.easycv_cache', exist_ok=True) - local_model = os.path.join('.easycv_cache', name+'.pth') + os.makedirs(CACHE_DIR, exist_ok=True) + local_model = os.path.join(CACHE_DIR, name+'.pth') if os.path.exists(local_model): weights = torch.load(local_model) if weights is not None: diff --git a/tools/eval.py b/tools/eval.py index 60be08df..66b69f44 100644 --- a/tools/eval.py +++ b/tools/eval.py @@ -32,6 +32,7 @@ from easycv.utils.config_tools import (CONFIG_TEMPLATE_ZOO, from easycv.utils.mmlab_utils import dynamic_adapt_for_mmlab from easycv.utils.setup_env import setup_multi_processes +from easycv.framework.errors import ValueError, NotImplementedError from easycv.utils.misc import reparameterize_models @@ -251,8 +252,7 @@ def main(): eval_kwargs.update(args.options) if args.inference_only: - raise RuntimeError('not implemented') - dataset.format_results(outputs, **eval_kwargs) + raise NotImplementedError('not implemented') if args.eval: for t in eval_pipe.evaluators: if 'metric_type' in t: diff --git a/tools/train.py b/tools/train.py index 2241e760..96f93db8 100644 --- a/tools/train.py +++ b/tools/train.py @@ -273,8 +273,9 @@ def main(): drop_last=getattr(cfg.data, 'drop_last', False), reuse_worker_cache=cfg.data.get('reuse_worker_cache', False), persistent_workers=cfg.data.get('persistent_workers', False), - collate_hooks=cfg.data.get('train_collate_hooks', [])) - for ds in datasets + collate_hooks=cfg.data.get('train_collate_hooks', []), + use_repeated_augment_sampler=cfg.data.get( + 'use_repeated_augment_sampler', False)) for ds in datasets ] else: default_args = dict(