diff --git a/configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_en.yml b/configs/rec/rec_svtrnet.yml similarity index 72% rename from configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_en.yml rename to configs/rec/rec_svtrnet.yml index 2bd9970b0..9859b4593 100644 --- a/configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_en.yml +++ b/configs/rec/rec_svtrnet.yml @@ -3,7 +3,7 @@ Global: epoch_num: 20 log_smooth_window: 20 print_batch_step: 10 - save_model_dir: ./output/rec/rec_svtr_tiny_en/ + save_model_dir: ./output/rec/svtr_tiny/ save_epoch_step: 1 # evaluation is run every 2000 iterations after the 0th iteration eval_batch_step: [0, 2000] @@ -47,16 +47,16 @@ Architecture: stn_activation: none Backbone: name: SVTRNet - img_size: [32, 100] # input size 可以尝试[64,200] - out_char_num: 25 # output char patch - out_channels: 192 # char patch dim - patch_merging: 'Conv' # 是否使用patch-merging 可选Conv Pool None - embed_dim: [64, 128, 256] # 三个阶段的sub-patch dim - depth: [3, 6, 3] # 当使用patch-merging时,控制patch-merging所在的层数,分成三阶段,每个阶段的层数 - num_heads: [2, 4, 8] # 三个阶段中的sub-patch heads - mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv - local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围,7表示高度的范围,11表示宽度的范围 - last_stage: True # 三个阶段中的sub-patch heads + img_size: [32, 100] + out_char_num: 25 + out_channels: 192 + patch_merging: 'Conv' + embed_dim: [64, 128, 256] + depth: [3, 6, 3] + num_heads: [2, 4, 8] + mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] + local_mixer: [[7, 11], [7, 11], [7, 11]] + last_stage: True prenorm: false Neck: name: SequenceEncoder @@ -93,12 +93,12 @@ Train: shuffle: True batch_size_per_card: 512 drop_last: True - num_workers: 2 + num_workers: 4 Eval: dataset: name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/evaluation/ + data_dir: ./train_data/data_lmdb_release/validation/ transforms: - DecodeImage: # load image img_mode: BGR diff --git a/configs/rec/svtr/rec_svtr_base_8local_10global_stn_ch.yml b/configs/rec/svtr/rec_svtr_base_8local_10global_stn_ch.yml deleted file mode 100644 index 8534e7887..000000000 --- a/configs/rec/svtr/rec_svtr_base_8local_10global_stn_ch.yml +++ /dev/null @@ -1,113 +0,0 @@ -Global: - use_gpu: True - epoch_num: 100 - log_smooth_window: 20 - print_batch_step: 10 - save_model_dir: ./output/rec/rec_svtr_base_stn_ch/ - save_epoch_step: 10 - # evaluation is run every 2000 iterations after the 0th iteration - eval_batch_step: [0, 2000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: doc/imgs_words/ch/word_1.jpg - # for data or label process - character_dict_path: ppocr/utils/ppocr_keys_v1.txt - max_text_length: 40 - infer_mode: False - use_space_char: True - save_res_path: ./output/rec/predicts_svtr_base_ch.txt - - -Optimizer: - name: AdamW - beta1: 0.9 - beta2: 0.99 - epsilon: 0.00000008 - weight_decay: 0.05 - no_weight_decay_name: norm pos_embed - one_dim_param_no_weight_decay: true - lr: - name: Cosine - learning_rate: 0.0003 - warmup_epoch: 5 - -Architecture: - model_type: rec - algorithm: SVTR - Transform: - name: STN_ON - tps_inputsize: [32, 64] - tps_outputsize: [32, 320] - num_control_points: 20 - tps_margins: [0.05,0.05] - stn_activation: none - Backbone: - name: SVTRNet - img_size: [32, 320] # input size 可以尝试[64,200] - out_char_num: 40 # output char patch - out_channels: 256 # char patch dim - patch_merging: 'Conv' # 是否使用patch-merging 可选Conv Pool None - embed_dim: [128, 256, 384] # 三个阶段的sub-patch dim - depth: [3, 6, 9] # 当使用patch-merging时,控制patch-merging所在的层数,分成三阶段,每个阶段的层数 - num_heads: [4, 8, 12] # 三个阶段中的sub-patch heads - mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv - local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围,7表示高度的范围,11表示宽度的范围 - prenorm: False - Neck: - name: SequenceEncoder - encoder_type: reshape - Head: - name: CTCHead - -Loss: - name: CTCLoss - -PostProcess: - name: CTCLabelDecode - -Metric: - name: RecMetric - main_indicator: acc - -Train: - dataset: - name: LMDBDataSet - data_dir: ./train_data/scene_ch/ch_scene - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - RecResizeImg: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: True - batch_size_per_card: 128 - drop_last: True - num_workers: 2 - -Eval: - dataset: - name: LMDBDataSet - data_dir: ./train_data/scene_ch/scene_test - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - RecResizeImg: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: False - drop_last: False - batch_size_per_card: 256 - num_workers: 2 diff --git a/configs/rec/svtr/rec_svtr_base_8local_10global_stn_en.yml b/configs/rec/svtr/rec_svtr_base_8local_10global_stn_en.yml deleted file mode 100644 index 2b7546c4d..000000000 --- a/configs/rec/svtr/rec_svtr_base_8local_10global_stn_en.yml +++ /dev/null @@ -1,117 +0,0 @@ -Global: - use_gpu: True - epoch_num: 20 - log_smooth_window: 20 - print_batch_step: 10 - save_model_dir: ./output/rec/rec_svtr_base_stn_en/ - save_epoch_step: 1 - # evaluation is run every 2000 iterations after the 0th iteration - eval_batch_step: [0, 2000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: doc/imgs_words_en/word_10.png - # for data or label process - character_dict_path: - character_type: en - max_text_length: 25 - infer_mode: False - use_space_char: False - save_res_path: ./output/rec/predicts_svtr_base.txt - - -Optimizer: - name: AdamW - beta1: 0.9 - beta2: 0.99 - epsilon: 0.00000008 - weight_decay: 0.05 - no_weight_decay_name: norm pos_embed - one_dim_param_no_weight_decay: true - lr: - name: Cosine - learning_rate: 0.00025 - warmup_epoch: 2 - -Architecture: - model_type: rec - algorithm: SVTR - Transform: - name: STN_ON - tps_inputsize: [32, 64] - tps_outputsize: [48, 160] - num_control_points: 20 - tps_margins: [0.05,0.05] - stn_activation: none - Backbone: - name: SVTRNet - img_size: [48, 160] # input size 可以尝试[64,200] - out_char_num: 40 # output char patch - out_channels: 256 # char patch dim - patch_merging: 'Conv' # 是否使用patch-merging 可选Conv Pool None - embed_dim: [128, 256, 384] # 三个阶段的sub-patch dim - depth: [3, 6, 9] # 当使用patch-merging时,控制patch-merging所在的层数,分成三阶段,每个阶段的层数 - num_heads: [4, 8, 12] # 三个阶段中的sub-patch heads - mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv - local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围,7表示高度的范围,11表示宽度的范围 - last_stage: True - prenorm: False - Neck: - name: SequenceEncoder - encoder_type: reshape - Head: - name: CTCHead - -Loss: - name: CTCLoss - -PostProcess: - name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training - -Metric: - name: RecMetric - main_indicator: acc - -Train: - dataset: - name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/training - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training - character_dict_path: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: True - batch_size_per_card: 256 - drop_last: True - num_workers: 4 - -Eval: - dataset: - name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/evaluation/ - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training - character_dict_path: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: False - drop_last: False - batch_size_per_card: 128 - num_workers: 2 diff --git a/configs/rec/svtr/rec_svtr_large_10local_11global_stn_ch.yml b/configs/rec/svtr/rec_svtr_large_10local_11global_stn_ch.yml deleted file mode 100644 index 68f0608f0..000000000 --- a/configs/rec/svtr/rec_svtr_large_10local_11global_stn_ch.yml +++ /dev/null @@ -1,113 +0,0 @@ -Global: - use_gpu: True - epoch_num: 100 - log_smooth_window: 20 - print_batch_step: 10 - save_model_dir: ./output/rec/rec_svtr_large_ch/ - save_epoch_step: 10 - # evaluation is run every 2000 iterations after the 0th iteration - eval_batch_step: [0, 2000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: doc/imgs_words/ch/word_1.jpg - # for data or label process - character_dict_path: ppocr/utils/ppocr_keys_v1.txt - max_text_length: 40 - infer_mode: False - use_space_char: True - save_res_path: ./output/rec/predicts_svtr_large_ch.txt - - -Optimizer: - name: AdamW - beta1: 0.9 - beta2: 0.99 - epsilon: 0.00000008 - weight_decay: 0.05 - no_weight_decay_name: norm pos_embed - one_dim_param_no_weight_decay: true - lr: - name: Cosine - learning_rate: 0.0003 - warmup_epoch: 5 - -Architecture: - model_type: rec - algorithm: SVTR - Transform: - name: STN_ON - tps_inputsize: [32, 64] - tps_outputsize: [32, 320] - num_control_points: 20 - tps_margins: [0.05,0.05] - stn_activation: none - Backbone: - name: SVTRNet - img_size: [32, 320] # input size 可以尝试[64,200] - out_char_num: 40 # output char patch - out_channels: 384 # char patch dim - patch_merging: 'Conv' # 是否使用patch-merging 可选Conv Pool None - embed_dim: [192, 256, 512] # 三个阶段的sub-patch dim - depth: [3, 9, 9] # 当使用patch-merging时,控制patch-merging所在的层数,分成三阶段,每个阶段的层数 - num_heads: [6, 8, 16] # 三个阶段中的sub-patch heads - mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv - local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围,7表示高度的范围,11表示宽度的范围 - prenorm: False - Neck: - name: SequenceEncoder - encoder_type: reshape - Head: - name: CTCHead - -Loss: - name: CTCLoss - -PostProcess: - name: CTCLabelDecode - -Metric: - name: RecMetric - main_indicator: acc - -Train: - dataset: - name: LMDBDataSet - data_dir: ./train_data/scene_ch/ch_scene - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - RecResizeImg: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: True - batch_size_per_card: 128 - drop_last: True - num_workers: 2 - -Eval: - dataset: - name: LMDBDataSet - data_dir: ./train_data/scene_ch/scene_test - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - RecResizeImg: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: False - drop_last: False - batch_size_per_card: 256 - num_workers: 2 diff --git a/configs/rec/svtr/rec_svtr_large_10local_11global_stn_en.yml b/configs/rec/svtr/rec_svtr_large_10local_11global_stn_en.yml deleted file mode 100644 index b995bb81a..000000000 --- a/configs/rec/svtr/rec_svtr_large_10local_11global_stn_en.yml +++ /dev/null @@ -1,117 +0,0 @@ -Global: - use_gpu: True - epoch_num: 20 - log_smooth_window: 20 - print_batch_step: 10 - save_model_dir: ./output/rec/rec_svtr_large_en/ - save_epoch_step: 1 - # evaluation is run every 2000 iterations after the 0th iteration - eval_batch_step: [0, 2000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: doc/imgs_words_en/word_10.png - # for data or label process - character_dict_path: - character_type: en - max_text_length: 25 - infer_mode: False - use_space_char: False - save_res_path: ./output/rec/predicts_svtr_large.txt - - -Optimizer: - name: AdamW - beta1: 0.9 - beta2: 0.99 - epsilon: 0.00000008 - weight_decay: 0.05 - no_weight_decay_name: norm pos_embed - one_dim_param_no_weight_decay: true - lr: - name: Cosine - learning_rate: 0.000125 - warmup_epoch: 2 - -Architecture: - model_type: rec - algorithm: SVTR - Transform: - name: STN_ON - tps_inputsize: [32, 64] - tps_outputsize: [48, 160] - num_control_points: 20 - tps_margins: [0.05,0.05] - stn_activation: none - Backbone: - name: SVTRNet - img_size: [48, 160] # input size 可以尝试[64,200] - out_char_num: 40 # output char patch - out_channels: 384 # char patch dim - patch_merging: 'Conv' # 是否使用patch-merging 可选Conv Pool None - embed_dim: [192, 256, 512] # 三个阶段的sub-patch dim - depth: [3, 9, 9] # 当使用patch-merging时,控制patch-merging所在的层数,分成三阶段,每个阶段的层数 - num_heads: [6, 8, 16] # 三个阶段中的sub-patch heads - mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv - local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围,7表示高度的范围,11表示宽度的范围 - prenorm: false - Neck: - name: SequenceEncoder - encoder_type: reshape - Head: - name: CTCHead - -Loss: - name: CTCLoss - -PostProcess: - name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training - -Metric: - name: RecMetric - main_indicator: acc - -Train: - dataset: - name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/training - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - RecAug: - - CTCLabelEncode: # Class handling label - - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training - character_dict_path: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: True - batch_size_per_card: 128 - drop_last: True - num_workers: 2 - -Eval: - dataset: - name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/evaluation/ - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training - character_dict_path: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: False - drop_last: False - batch_size_per_card: 128 - num_workers: 2 diff --git a/configs/rec/svtr/rec_svtr_small_8local_7global_stn_ch.yml b/configs/rec/svtr/rec_svtr_small_8local_7global_stn_ch.yml deleted file mode 100644 index 38ddb0e4f..000000000 --- a/configs/rec/svtr/rec_svtr_small_8local_7global_stn_ch.yml +++ /dev/null @@ -1,114 +0,0 @@ -Global: - use_gpu: True - epoch_num: 100 - log_smooth_window: 20 - print_batch_step: 10 - save_model_dir: ./output/rec/rec_svtr_small_ch/ - save_epoch_step: 10 - # evaluation is run every 2000 iterations after the 0th iteration - eval_batch_step: [0, 2000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: doc/imgs_words/ch/word_1.jpg - # for data or label process - character_dict_path: ppocr/utils/ppocr_keys_v1.txt - max_text_length: 40 - infer_mode: False - use_space_char: True - save_res_path: ./output/rec/predicts_svtr_small_ch.txt - - -Optimizer: - name: AdamW - beta1: 0.9 - beta2: 0.99 - epsilon: 0.00000008 - weight_decay: 0.05 - no_weight_decay_name: norm pos_embed - one_dim_param_no_weight_decay: true - lr: - name: Cosine - learning_rate: 0.0003 - warmup_epoch: 5 - -Architecture: - model_type: rec - algorithm: SVTR - Transform: - name: STN_ON - tps_inputsize: [32, 64] - tps_outputsize: [32, 320] - num_control_points: 20 - tps_margins: [0.05,0.05] - stn_activation: none - Backbone: - name: SVTRNet - img_size: [32, 320] # input size 可以尝试[64,200] - out_char_num: 40 # output char patch - out_channels: 192 # char patch dim - patch_merging: 'Conv' # 是否使用patch-merging 可选Conv Pool None - embed_dim: [96, 192, 256] # 三个阶段的sub-patch dim - depth: [3, 6, 6] # 当使用patch-merging时,控制patch-merging所在的层数,分成三阶段,每个阶段的层数 - num_heads: [3, 6, 8] # 三个阶段中的sub-patch heads - mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv - local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围,7表示高度的范围,11表示宽度的范围 - last_stage: True - prenorm: False - Neck: - name: SequenceEncoder - encoder_type: reshape - Head: - name: CTCHead - -Loss: - name: CTCLoss - -PostProcess: - name: CTCLabelDecode - -Metric: - name: RecMetric - main_indicator: acc - -Train: - dataset: - name: LMDBDataSet - data_dir: ./train_data/scene_ch/ch_scene - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - RecResizeImg: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: True - batch_size_per_card: 128 - drop_last: True - num_workers: 2 - -Eval: - dataset: - name: LMDBDataSet - data_dir: ./train_data/scene_ch/scene_test - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - RecResizeImg: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: False - drop_last: False - batch_size_per_card: 256 - num_workers: 2 diff --git a/configs/rec/svtr/rec_svtr_small_8local_7global_stn_en.yml b/configs/rec/svtr/rec_svtr_small_8local_7global_stn_en.yml deleted file mode 100644 index 69fea2e1f..000000000 --- a/configs/rec/svtr/rec_svtr_small_8local_7global_stn_en.yml +++ /dev/null @@ -1,117 +0,0 @@ -Global: - use_gpu: True - epoch_num: 20 - log_smooth_window: 20 - print_batch_step: 10 - save_model_dir: ./output/rec/rec_svtr_small_stn_en/ - save_epoch_step: 1 - # evaluation is run every 2000 iterations after the 0th iteration - eval_batch_step: [0, 2000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: doc/imgs_words_en/word_10.png - # for data or label process - character_dict_path: - character_type: en - max_text_length: 25 - infer_mode: False - use_space_char: False - save_res_path: ./output/rec/predicts_svtr_small.txt - - -Optimizer: - name: AdamW - beta1: 0.9 - beta2: 0.99 - epsilon: 0.00000008 - weight_decay: 0.05 - no_weight_decay_name: norm pos_embed - one_dim_param_no_weight_decay: true - lr: - name: Cosine - learning_rate: 0.0005 - warmup_epoch: 2 - -Architecture: - model_type: rec - algorithm: SVTR - Transform: - name: STN_ON - tps_inputsize: [32, 64] - tps_outputsize: [32, 100] - num_control_points: 20 - tps_margins: [0.05,0.05] - stn_activation: none - Backbone: - name: SVTRNet - img_size: [32, 100] # input size 可以尝试[64,200] - out_char_num: 25 # output char patch - out_channels: 192 # char patch dim - patch_merging: 'Conv' # 是否使用patch-merging 可选Conv Pool None - embed_dim: [96, 192, 256] # 三个阶段的sub-patch dim - depth: [3, 6, 6] # 当使用patch-merging时,控制patch-merging所在的层数,分成三阶段,每个阶段的层数 - num_heads: [3, 6, 8] # 三个阶段中的sub-patch heads - mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv - local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围,7表示高度的范围,11表示宽度的范围 - last_stage: True - prenorm: False - Neck: - name: SequenceEncoder - encoder_type: reshape - Head: - name: CTCHead - -Loss: - name: CTCLoss - -PostProcess: - name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training - -Metric: - name: RecMetric - main_indicator: acc - -Train: - dataset: - name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/training - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training - character_dict_path: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: True - batch_size_per_card: 512 - drop_last: True - num_workers: 4 - -Eval: - dataset: - name: LMDBDataSet - data_dir: ./train_data/data_lmdb_release/evaluation - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training - character_dict_path: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: False - drop_last: False - batch_size_per_card: 256 - num_workers: 2 diff --git a/configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_ch.yml b/configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_ch.yml deleted file mode 100644 index e0d77f632..000000000 --- a/configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_ch.yml +++ /dev/null @@ -1,114 +0,0 @@ -Global: - use_gpu: True - epoch_num: 100 - log_smooth_window: 20 - print_batch_step: 10 - save_model_dir: ./output/rec/rec_svtr_tiny_ch/ - save_epoch_step: 10 - # evaluation is run every 2000 iterations after the 0th iteration - eval_batch_step: [0, 2000] - cal_metric_during_train: True - pretrained_model: - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: doc/imgs_words/ch/word_1.jpg - # for data or label process - character_dict_path: ppocr/utils/ppocr_keys_v1.txt - max_text_length: 40 - infer_mode: False - use_space_char: True - save_res_path: ./output/rec/predicts_svtr_tiny_ch.txt - - -Optimizer: - name: AdamW - beta1: 0.9 - beta2: 0.99 - epsilon: 0.00000008 - weight_decay: 0.05 - no_weight_decay_name: norm pos_embed - one_dim_param_no_weight_decay: true - lr: - name: Cosine - learning_rate: 0.0003 - warmup_epoch: 5 - -Architecture: - model_type: rec - algorithm: SVTR - Transform: - name: STN_ON - tps_inputsize: [32, 64] - tps_outputsize: [32, 320] - num_control_points: 20 - tps_margins: [0.05,0.05] - stn_activation: none - Backbone: - name: SVTRNet - img_size: [32, 320] # input size - out_char_num: 40 # number char patch - out_channels: 192 # char patch dim - patch_merging: 'Conv' # 是否使用patch-merging 可选Conv Pool None - embed_dim: [64, 128, 256] # 三个阶段的sub-patch dim - depth: [3, 6, 3] # 当使用patch-merging时,控制patch-merging所在的层数,分成三阶段,每个阶段的层数 - num_heads: [2, 4, 8] # 三个阶段中的sub-patch heads - mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv - local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围,7表示高度的范围,11表示宽度的范围 - last_stage: True # 三个阶段中的sub-patch heads - prenorm: false - Neck: - name: SequenceEncoder - encoder_type: reshape - Head: - name: CTCHead - -Loss: - name: CTCLoss - -PostProcess: - name: CTCLabelDecode - -Metric: - name: RecMetric - main_indicator: acc - -Train: - dataset: - name: LMDBDataSet - data_dir: ./train_data/scene_ch/ch_scene - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - RecResizeImg: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: True - batch_size_per_card: 128 - drop_last: True - num_workers: 2 - -Eval: - dataset: - name: LMDBDataSet - data_dir: ./train_data/scene_ch/scene_test - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - CTCLabelEncode: # Class handling label - - RecResizeImg: - image_shape: [3, 64, 256] - padding: False - - KeepKeys: - keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order - loader: - shuffle: False - drop_last: False - batch_size_per_card: 256 - num_workers: 2 diff --git a/deploy/slim/quantization/quant.py b/deploy/slim/quantization/quant.py index 1dffaab0e..355ba77f8 100755 --- a/deploy/slim/quantization/quant.py +++ b/deploy/slim/quantization/quant.py @@ -137,7 +137,7 @@ def main(config, device, logger, vdl_writer): config['Optimizer'], epochs=config['Global']['epoch_num'], step_each_epoch=len(train_dataloader), - parameters=model.parameters()) + model=model) # resume PACT training process if config["Global"]["checkpoints"] is not None: diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py index e3fb4d7eb..501ef87ce 100644 --- a/ppocr/data/imaug/rec_img_aug.py +++ b/ppocr/data/imaug/rec_img_aug.py @@ -209,17 +209,16 @@ class PRENResizeImg(object): class SVTRRecResizeImg(object): def __init__(self, - image_shape, - infer_mode=False, - character_dict_path='./ppocr/utils/ppocr_keys_v1.txt', - padding=True, - **kwargs): + image_shape, + infer_mode=False, + character_dict_path='./ppocr/utils/ppocr_keys_v1.txt', + padding=True, + **kwargs): self.image_shape = image_shape self.infer_mode = infer_mode self.character_dict_path = character_dict_path self.padding = padding - def __call__(self, data): img = data['image'] norm_img = resize_norm_img_svtr(img, self.image_shape, self.padding) @@ -227,7 +226,6 @@ class SVTRRecResizeImg(object): return data - def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25): imgC, imgH, imgW_min, imgW_max = image_shape h = img.shape[0] @@ -346,23 +344,21 @@ def resize_norm_img_srn(img, image_shape): return np.reshape(img_black, (c, row, col)).astype(np.float32) - -def resize_norm_img_svtr(img, image_shape, padding=True): +def resize_norm_img_svtr(img, image_shape, padding=False): imgC, imgH, imgW = image_shape h = img.shape[0] w = img.shape[1] if not padding: - if h > 2.0 * w: - image = Image.fromarray(img) - image1 = image.rotate(90, expand=True) - image2 = image.rotate(-90, expand=True) - img1 = np.array(image1) - img2 = np.array(image2) + image = Image.fromarray(img) + image1 = image.rotate(90, expand=True) + image2 = image.rotate(-90, expand=True) + img1 = np.array(image1) + img2 = np.array(image2) else: - img1 = copy.deepcopy(img) - img2 = copy.deepcopy(img) - + img1 = copy.deepcopy(img) + img2 = copy.deepcopy(img) + resized_image = cv2.resize( img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) resized_image1 = cv2.resize( diff --git a/ppocr/modeling/backbones/rec_svtrnet.py b/ppocr/modeling/backbones/rec_svtrnet.py index 2b2c6c6de..5ded74378 100644 --- a/ppocr/modeling/backbones/rec_svtrnet.py +++ b/ppocr/modeling/backbones/rec_svtrnet.py @@ -296,47 +296,49 @@ class PatchEmbed(nn.Layer): if sub_num == 2: self.proj = nn.Sequential( ConvBNLayer( - in_channels, - embed_dim // 2, - 3, - 2, - 1, + in_channels=in_channels, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, act=nn.GELU, bias_attr=None), ConvBNLayer( - embed_dim // 2, - embed_dim, - 3, - 2, - 1, + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, act=nn.GELU, bias_attr=None)) if sub_num == 3: self.proj = nn.Sequential( ConvBNLayer( - in_channels, - embed_dim // 4, - 3, - 2, - 1, + in_channels=in_channels, + out_channels=embed_dim // 4, + kernel_size=3, + stride=2, + padding=1, act=nn.GELU, bias_attr=None), ConvBNLayer( - embed_dim // 4, - embed_dim // 2, - 3, - 2, - 1, + in_channels=embed_dim // 4, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, act=nn.GELU, bias_attr=None), ConvBNLayer( embed_dim // 2, embed_dim, - 3, - 2, - 1, + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, act=nn.GELU, - bias_attr=None), ) + bias_attr=None)) def forward(self, x): B, C, H, W = x.shape