delete yml file, fix quant and svtr
parent 68fb057dc6
commit 1c75ff630e
@@ -3,7 +3,7 @@ Global:
   epoch_num: 20
   log_smooth_window: 20
   print_batch_step: 10
-  save_model_dir: ./output/rec/rec_svtr_tiny_en/
+  save_model_dir: ./output/rec/svtr_tiny/
   save_epoch_step: 1
   # evaluation is run every 2000 iterations after the 0th iteration
   eval_batch_step: [0, 2000]
@@ -47,16 +47,16 @@ Architecture:
     stn_activation: none
   Backbone:
     name: SVTRNet
-    img_size: [32, 100] # input size; [64, 200] is also worth trying
-    out_char_num: 25 # output char patch
-    out_channels: 192 # char patch dim
-    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
-    embed_dim: [64, 128, 256] # sub-patch dim of the three stages
-    depth: [3, 6, 3] # number of layers in each of the three stages; patch merging sits between stages
-    num_heads: [2, 4, 8] # sub-patch heads of the three stages
-    mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
-    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
-    last_stage: True # whether to use the last stage
+    img_size: [32, 100]
+    out_char_num: 25
+    out_channels: 192
+    patch_merging: 'Conv'
+    embed_dim: [64, 128, 256]
+    depth: [3, 6, 3]
+    num_heads: [2, 4, 8]
+    mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global']
+    local_mixer: [[7, 11], [7, 11], [7, 11]]
+    last_stage: True
     prenorm: false
   Neck:
     name: SequenceEncoder
@@ -93,12 +93,12 @@ Train:
     shuffle: True
     batch_size_per_card: 512
     drop_last: True
-    num_workers: 2
+    num_workers: 4

 Eval:
   dataset:
     name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/evaluation/
+    data_dir: ./train_data/data_lmdb_release/validation/
     transforms:
       - DecodeImage: # load image
           img_mode: BGR
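For context, the keys under `Backbone` are handed to the backbone constructor as keyword arguments, so the block kept above corresponds roughly to the call sketched below (the import path and builder plumbing are assumptions, not shown in this diff):

```python
# Illustrative mapping of the YAML Backbone keys onto the SVTRNet constructor;
# the import path is assumed, not stated in this commit.
from ppocr.modeling.backbones.rec_svtrnet import SVTRNet

backbone = SVTRNet(
    img_size=[32, 100],
    out_char_num=25,
    out_channels=192,
    patch_merging='Conv',
    embed_dim=[64, 128, 256],
    depth=[3, 6, 3],
    num_heads=[2, 4, 8],
    mixer=['Local'] * 6 + ['Global'] * 6,
    local_mixer=[[7, 11], [7, 11], [7, 11]],
    last_stage=True,
    prenorm=False)
```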
@@ -1,113 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_base_stn_ch/
  save_epoch_step: 10
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 40
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_svtr_base_ch.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.0003
    warmup_epoch: 5

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 320]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [32, 320] # input size; [64, 200] is also worth trying
    out_char_num: 40 # output char patch
    out_channels: 256 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [128, 256, 384] # sub-patch dim of the three stages
    depth: [3, 6, 9] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [4, 8, 12] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    prenorm: False
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/ch_scene
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 2

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/scene_test
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
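One pattern visible across the STN configs in this commit (an observation, not something stated in the diff): `Backbone.img_size` always matches `Transform.tps_outputsize`, since the STN resamples the input to that size before the backbone sees it. A quick check over the deleted configs:

```python
# (tps_outputsize, img_size) pairs copied from the deleted configs.
pairs = {
    "rec_svtr_base_stn_ch":  ([32, 320], [32, 320]),
    "rec_svtr_base_stn_en":  ([48, 160], [48, 160]),
    "rec_svtr_small_stn_en": ([32, 100], [32, 100]),
    "rec_svtr_tiny_ch":      ([32, 320], [32, 320]),
}
for name, (tps_outputsize, img_size) in pairs.items():
    assert tps_outputsize == img_size, name
    print(f"{name}: STN output {tps_outputsize} feeds backbone img_size {img_size}")
```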
@@ -1,117 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_base_stn_en/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words_en/word_10.png
  # for data or label process
  character_dict_path:
  character_type: en
  max_text_length: 25
  infer_mode: False
  use_space_char: False
  save_res_path: ./output/rec/predicts_svtr_base.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.00025
    warmup_epoch: 2

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [48, 160]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [48, 160] # input size; [64, 200] is also worth trying
    out_char_num: 40 # output char patch
    out_channels: 256 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [128, 256, 384] # sub-patch dim of the three stages
    depth: [3, 6, 9] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [4, 8, 12] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    last_stage: True
    prenorm: False
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/training
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/evaluation/
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 2
@@ -1,113 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_large_ch/
  save_epoch_step: 10
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 40
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_svtr_large_ch.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.0003
    warmup_epoch: 5

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 320]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [32, 320] # input size; [64, 200] is also worth trying
    out_char_num: 40 # output char patch
    out_channels: 384 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [192, 256, 512] # sub-patch dim of the three stages
    depth: [3, 9, 9] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [6, 8, 16] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    prenorm: False
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/ch_scene
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 2

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/scene_test
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
@@ -1,117 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_large_en/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words_en/word_10.png
  # for data or label process
  character_dict_path:
  character_type: en
  max_text_length: 25
  infer_mode: False
  use_space_char: False
  save_res_path: ./output/rec/predicts_svtr_large.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.000125
    warmup_epoch: 2

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [48, 160]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [48, 160] # input size; [64, 200] is also worth trying
    out_char_num: 40 # output char patch
    out_channels: 384 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [192, 256, 512] # sub-patch dim of the three stages
    depth: [3, 9, 9] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [6, 8, 16] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    prenorm: false
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/training
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - RecAug:
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 2

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/evaluation/
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 2
@@ -1,114 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_small_ch/
  save_epoch_step: 10
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 40
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_svtr_small_ch.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.0003
    warmup_epoch: 5

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 320]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [32, 320] # input size; [64, 200] is also worth trying
    out_char_num: 40 # output char patch
    out_channels: 192 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [96, 192, 256] # sub-patch dim of the three stages
    depth: [3, 6, 6] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [3, 6, 8] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    last_stage: True
    prenorm: False
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/ch_scene
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 2

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/scene_test
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
@@ -1,117 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_small_stn_en/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words_en/word_10.png
  # for data or label process
  character_dict_path:
  character_type: en
  max_text_length: 25
  infer_mode: False
  use_space_char: False
  save_res_path: ./output/rec/predicts_svtr_small.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.0005
    warmup_epoch: 2

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 100]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [32, 100] # input size; [64, 200] is also worth trying
    out_char_num: 25 # output char patch
    out_channels: 192 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [96, 192, 256] # sub-patch dim of the three stages
    depth: [3, 6, 6] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [3, 6, 8] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    last_stage: True
    prenorm: False
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/training
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 512
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/evaluation
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
@@ -1,114 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_tiny_ch/
  save_epoch_step: 10
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 40
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_svtr_tiny_ch.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.0003
    warmup_epoch: 5

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 320]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [32, 320] # input size
    out_char_num: 40 # number of output char patches
    out_channels: 192 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [64, 128, 256] # sub-patch dim of the three stages
    depth: [3, 6, 3] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [2, 4, 8] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    last_stage: True # whether to use the last stage
    prenorm: false
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/ch_scene
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 2

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/scene_test
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
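Another relationship that holds across all of the deleted configs (an observation, not something stated in the diff): `mixer` lists one entry per transformer block, so its length equals `sum(depth)`. A quick sanity check of the four model sizes:

```python
# (depth, mixer) settings copied from the deleted tiny/small/base/large configs.
settings = {
    "tiny":  ([3, 6, 3],  ['Local'] * 6  + ['Global'] * 6),
    "small": ([3, 6, 6],  ['Local'] * 8  + ['Global'] * 7),
    "base":  ([3, 6, 9],  ['Local'] * 8  + ['Global'] * 10),
    "large": ([3, 9, 9],  ['Local'] * 10 + ['Global'] * 11),
}
for name, (depth, mixer) in settings.items():
    assert len(mixer) == sum(depth), name
    print(f"{name}: sum(depth)={sum(depth)}, len(mixer)={len(mixer)}")
```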
@@ -137,7 +137,7 @@ def main(config, device, logger, vdl_writer):
        config['Optimizer'],
        epochs=config['Global']['epoch_num'],
        step_each_epoch=len(train_dataloader),
-        parameters=model.parameters())
+        model=model)

    # resume PACT training process
    if config["Global"]["checkpoints"] is not None:
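Passing the whole model instead of `model.parameters()` gives the optimizer builder access to parameter names and shapes, which the SVTR configs rely on through `no_weight_decay_name: norm pos_embed` and `one_dim_param_no_weight_decay: true`. A minimal sketch of that grouping idea, as a hypothetical helper rather than the repo's actual `build_optimizer`:

```python
def split_weight_decay_groups(model, skip_names=("norm", "pos_embed"),
                              skip_one_dim=True, weight_decay=0.05):
    """Split a model's parameters into decayed and non-decayed groups
    by name and dimensionality (sketch only, not the repo's code)."""
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if any(key in name for key in skip_names) or (
                skip_one_dim and len(param.shape) == 1):
            no_decay.append(param)
        else:
            decay.append(param)
    # Parameter groups in the form most optimizers accept.
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]
```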
@@ -209,17 +209,16 @@ class PRENResizeImg(object):

class SVTRRecResizeImg(object):
    def __init__(self,
-                 image_shape,
-                 infer_mode=False,
-                 character_dict_path='./ppocr/utils/ppocr_keys_v1.txt',
-                 padding=True,
-                 **kwargs):
+                 image_shape,
+                 infer_mode=False,
+                 character_dict_path='./ppocr/utils/ppocr_keys_v1.txt',
+                 padding=True,
+                 **kwargs):
        self.image_shape = image_shape
        self.infer_mode = infer_mode
        self.character_dict_path = character_dict_path
        self.padding = padding

    def __call__(self, data):
        img = data['image']
        norm_img = resize_norm_img_svtr(img, self.image_shape, self.padding)

@@ -227,7 +226,6 @@ class SVTRRecResizeImg(object):
        return data


def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
    imgC, imgH, imgW_min, imgW_max = image_shape
    h = img.shape[0]

@@ -346,23 +344,21 @@ def resize_norm_img_srn(img, image_shape):
    return np.reshape(img_black, (c, row, col)).astype(np.float32)


-def resize_norm_img_svtr(img, image_shape, padding=True):
+def resize_norm_img_svtr(img, image_shape, padding=False):
    imgC, imgH, imgW = image_shape
    h = img.shape[0]
    w = img.shape[1]
-    if not padding:
    if h > 2.0 * w:
-        image = Image.fromarray(img)
-        image1 = image.rotate(90, expand=True)
-        image2 = image.rotate(-90, expand=True)
-        img1 = np.array(image1)
-        img2 = np.array(image2)
+        image = Image.fromarray(img)
+        image1 = image.rotate(90, expand=True)
+        image2 = image.rotate(-90, expand=True)
+        img1 = np.array(image1)
+        img2 = np.array(image2)
    else:
-        img1 = copy.deepcopy(img)
-        img2 = copy.deepcopy(img)
+        img1 = copy.deepcopy(img)
+        img2 = copy.deepcopy(img)

    resized_image = cv2.resize(
        img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
    resized_image1 = cv2.resize(
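With the default `padding` flag for `resize_norm_img_svtr` flipped to False, the plain fixed-size resize path is taken by default. A rough sketch of that resize-and-normalize step, assuming the usual PaddleOCR convention of a 3-channel HWC input scaled to [-1, 1] in CHW layout (not the exact function body):

```python
import cv2

def resize_norm_img_fixed(img, image_shape=(3, 64, 256)):
    # Resize to the target (imgH, imgW), scale to [0, 1], map to [-1, 1],
    # and return a CHW float32 array -- a sketch of the SVTR-style preprocessing.
    imgC, imgH, imgW = image_shape
    resized = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
    resized = resized.astype('float32').transpose((2, 0, 1)) / 255.0
    resized -= 0.5
    resized /= 0.5
    return resized
```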
@@ -296,47 +296,49 @@ class PatchEmbed(nn.Layer):
        if sub_num == 2:
            self.proj = nn.Sequential(
                ConvBNLayer(
-                    in_channels,
-                    embed_dim // 2,
-                    3,
-                    2,
-                    1,
+                    in_channels=in_channels,
+                    out_channels=embed_dim // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
-                    embed_dim // 2,
-                    embed_dim,
-                    3,
-                    2,
-                    1,
+                    in_channels=embed_dim // 2,
+                    out_channels=embed_dim,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                    act=nn.GELU,
                    bias_attr=None))
        if sub_num == 3:
            self.proj = nn.Sequential(
                ConvBNLayer(
-                    in_channels,
-                    embed_dim // 4,
-                    3,
-                    2,
-                    1,
+                    in_channels=in_channels,
+                    out_channels=embed_dim // 4,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
-                    embed_dim // 4,
-                    embed_dim // 2,
-                    3,
-                    2,
-                    1,
+                    in_channels=embed_dim // 4,
+                    out_channels=embed_dim // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
-                    embed_dim // 2,
-                    embed_dim,
-                    3,
-                    2,
-                    1,
+                    in_channels=embed_dim // 2,
+                    out_channels=embed_dim,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                    act=nn.GELU,
-                    bias_attr=None), )
+                    bias_attr=None))

    def forward(self, x):
        B, C, H, W = x.shape
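The `ConvBNLayer` calls above now spell out their arguments by keyword, which makes the conv/BN patch-embedding stem easier to read and less error-prone. A hypothetical minimal `ConvBNLayer` consistent with those keywords (the repo's real implementation may differ in details):

```python
import paddle.nn as nn

class ConvBNLayer(nn.Layer):
    # Hypothetical sketch matching the keyword arguments used above.
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1,
                 padding=0, act=None, bias_attr=False):
        super().__init__()
        self.conv = nn.Conv2D(in_channels, out_channels, kernel_size,
                              stride=stride, padding=padding,
                              bias_attr=bias_attr)
        self.norm = nn.BatchNorm2D(out_channels)
        # `act` is passed as a layer class (e.g. nn.GELU) and instantiated here.
        self.act = act() if act is not None else None

    def forward(self, x):
        x = self.norm(self.conv(x))
        return self.act(x) if self.act is not None else x
```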