diff --git a/configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_en.yml b/configs/rec/rec_svtrnet.yml
similarity index 72%
rename from configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_en.yml
rename to configs/rec/rec_svtrnet.yml
index 2bd9970b08..9859b45938 100644
--- a/configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_en.yml
+++ b/configs/rec/rec_svtrnet.yml
@@ -3,7 +3,7 @@ Global:
   epoch_num: 20
   log_smooth_window: 20
   print_batch_step: 10
-  save_model_dir: ./output/rec/rec_svtr_tiny_en/
+  save_model_dir: ./output/rec/svtr_tiny/
   save_epoch_step: 1
   # evaluation is run every 2000 iterations after the 0th iteration
   eval_batch_step: [0, 2000]
@@ -47,16 +47,16 @@ Architecture:
     stn_activation: none
   Backbone:
     name: SVTRNet
-    img_size: [32, 100]  # input size 可以尝试[64,200]
-    out_char_num: 25     # output char patch
-    out_channels: 192    # char patch dim
-    patch_merging: 'Conv'        # 是否使用patch-merging 可选Conv Pool None
-    embed_dim: [64, 128, 256] # 三个阶段的sub-patch dim
-    depth: [3, 6, 3]           # 当使用patch-merging时，控制patch-merging所在的层数，分成三阶段，每个阶段的层数
-    num_heads: [2, 4, 8]       # 三个阶段中的sub-patch heads
-    mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
-    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围，7表示高度的范围，11表示宽度的范围
-    last_stage: True      # 三个阶段中的sub-patch heads
+    img_size: [32, 100]  # input size; [64, 200] may also be tried
+    out_char_num: 25  # number of output char patches
+    out_channels: 192  # char patch dim
+    patch_merging: 'Conv'  # whether to use patch-merging; options: Conv, Pool, None
+    embed_dim: [64, 128, 256]  # sub-patch dim of each of the three stages
+    depth: [3, 6, 3]  # number of layers in each of the three stages; with patch-merging this also controls where patch-merging is placed
+    num_heads: [2, 4, 8]  # sub-patch heads of each of the three stages
+    mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global']  # options: Local atten, Global atten, Conv
+    local_mixer: [[7, 11], [7, 11], [7, 11]]  # local mixer range: 7 is the height range, 11 is the width range
+    last_stage: True
     prenorm: false
   Neck:
     name: SequenceEncoder
@@ -93,12 +93,12 @@ Train:
     shuffle: True
     batch_size_per_card: 512
     drop_last: True
-    num_workers: 2
+    num_workers: 4
 
 Eval:
   dataset:
     name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/evaluation/
+    data_dir: ./train_data/data_lmdb_release/validation/
     transforms:
       - DecodeImage: # load image
           img_mode: BGR
diff --git a/configs/rec/svtr/rec_svtr_base_8local_10global_stn_ch.yml b/configs/rec/svtr/rec_svtr_base_8local_10global_stn_ch.yml
deleted file mode 100644
index 8534e78874..0000000000
--- a/configs/rec/svtr/rec_svtr_base_8local_10global_stn_ch.yml
+++ /dev/null
@@ -1,113 +0,0 @@
-Global:
-  use_gpu: True
-  epoch_num: 100
-  log_smooth_window: 20
-  print_batch_step: 10
-  save_model_dir: ./output/rec/rec_svtr_base_stn_ch/
-  save_epoch_step: 10
-  # evaluation is run every 2000 iterations after the 0th iteration
-  eval_batch_step: [0, 2000]
-  cal_metric_during_train: True
-  pretrained_model:
-  checkpoints:
-  save_inference_dir:
-  use_visualdl: False
-  infer_img: doc/imgs_words/ch/word_1.jpg
-  # for data or label process
-  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
-  max_text_length: 40
-  infer_mode: False
-  use_space_char: True
-  save_res_path: ./output/rec/predicts_svtr_base_ch.txt
-
-
-Optimizer:
-  name: AdamW
-  beta1: 0.9
-  beta2: 0.99
-  epsilon: 0.00000008
-  weight_decay: 0.05
-  no_weight_decay_name: norm pos_embed
-  one_dim_param_no_weight_decay: true
-  lr:
-    name: Cosine
-    learning_rate: 0.0003
-    warmup_epoch: 5
-
-Architecture:
-  model_type: rec
-  algorithm: SVTR
-  Transform:
-    name: STN_ON
-    tps_inputsize: [32, 64]
-    tps_outputsize: [32, 320]
-    num_control_points: 20
-    tps_margins: [0.05,0.05]
-    stn_activation: none
-  Backbone:
-    name: SVTRNet
-    img_size: [32, 320]  # input size 可以尝试[64,200]
-    out_char_num: 40     # output char patch
-    out_channels: 256    # char patch dim
-    patch_merging: 'Conv'        # 是否使用patch-merging 可选Conv Pool None
-    embed_dim: [128, 256, 384] # 三个阶段的sub-patch dim
-    depth: [3, 6, 9]           # 当使用patch-merging时，控制patch-merging所在的层数，分成三阶段，每个阶段的层数
-    num_heads: [4, 8, 12]       # 三个阶段中的sub-patch heads
-    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
-    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围，7表示高度的范围，11表示宽度的范围
-    prenorm: False
-  Neck:
-    name: SequenceEncoder
-    encoder_type: reshape
-  Head:
-    name: CTCHead
-
-Loss:
-  name: CTCLoss
-
-PostProcess:
-  name: CTCLabelDecode
-
-Metric:
-  name: RecMetric
-  main_indicator: acc
-
-Train:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/scene_ch/ch_scene
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - RecResizeImg:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: True
-    batch_size_per_card: 128
-    drop_last: True
-    num_workers: 2
-
-Eval:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/scene_ch/scene_test
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - RecResizeImg:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: False
-    drop_last: False
-    batch_size_per_card: 256
-    num_workers: 2
diff --git a/configs/rec/svtr/rec_svtr_base_8local_10global_stn_en.yml b/configs/rec/svtr/rec_svtr_base_8local_10global_stn_en.yml
deleted file mode 100644
index 2b7546c4d2..0000000000
--- a/configs/rec/svtr/rec_svtr_base_8local_10global_stn_en.yml
+++ /dev/null
@@ -1,117 +0,0 @@
-Global:
-  use_gpu: True
-  epoch_num: 20
-  log_smooth_window: 20
-  print_batch_step: 10
-  save_model_dir: ./output/rec/rec_svtr_base_stn_en/
-  save_epoch_step: 1
-  # evaluation is run every 2000 iterations after the 0th iteration
-  eval_batch_step: [0, 2000]
-  cal_metric_during_train: True
-  pretrained_model:
-  checkpoints:
-  save_inference_dir:
-  use_visualdl: False
-  infer_img: doc/imgs_words_en/word_10.png
-  # for data or label process
-  character_dict_path:
-  character_type: en
-  max_text_length: 25
-  infer_mode: False
-  use_space_char: False
-  save_res_path: ./output/rec/predicts_svtr_base.txt
-
-
-Optimizer:
-  name: AdamW
-  beta1: 0.9
-  beta2: 0.99
-  epsilon: 0.00000008
-  weight_decay: 0.05
-  no_weight_decay_name: norm pos_embed
-  one_dim_param_no_weight_decay: true
-  lr:
-    name: Cosine
-    learning_rate: 0.00025
-    warmup_epoch: 2
-
-Architecture:
-  model_type: rec
-  algorithm: SVTR
-  Transform:
-    name: STN_ON
-    tps_inputsize: [32, 64]
-    tps_outputsize: [48, 160]
-    num_control_points: 20
-    tps_margins: [0.05,0.05]
-    stn_activation: none
-  Backbone:
-    name: SVTRNet
-    img_size: [48, 160]  # input size 可以尝试[64,200]
-    out_char_num: 40     # output char patch
-    out_channels: 256    # char patch dim
-    patch_merging: 'Conv'        # 是否使用patch-merging 可选Conv Pool None
-    embed_dim: [128, 256, 384] # 三个阶段的sub-patch dim
-    depth: [3, 6, 9]           # 当使用patch-merging时，控制patch-merging所在的层数，分成三阶段，每个阶段的层数
-    num_heads: [4, 8, 12]       # 三个阶段中的sub-patch heads
-    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
-    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围，7表示高度的范围，11表示宽度的范围
-    last_stage: True
-    prenorm: False
-  Neck:
-    name: SequenceEncoder
-    encoder_type: reshape
-  Head:
-    name: CTCHead
-
-Loss:
-  name: CTCLoss
-
-PostProcess:
-  name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training
-
-Metric:
-  name: RecMetric
-  main_indicator: acc
-
-Train:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/training
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
-          character_dict_path:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: True
-    batch_size_per_card: 256
-    drop_last: True
-    num_workers: 4
-
-Eval:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/evaluation/
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
-          character_dict_path:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: False
-    drop_last: False
-    batch_size_per_card: 128
-    num_workers: 2
diff --git a/configs/rec/svtr/rec_svtr_large_10local_11global_stn_ch.yml b/configs/rec/svtr/rec_svtr_large_10local_11global_stn_ch.yml
deleted file mode 100644
index 68f0608f01..0000000000
--- a/configs/rec/svtr/rec_svtr_large_10local_11global_stn_ch.yml
+++ /dev/null
@@ -1,113 +0,0 @@
-Global:
-  use_gpu: True
-  epoch_num: 100
-  log_smooth_window: 20
-  print_batch_step: 10
-  save_model_dir: ./output/rec/rec_svtr_large_ch/
-  save_epoch_step: 10
-  # evaluation is run every 2000 iterations after the 0th iteration
-  eval_batch_step: [0, 2000]
-  cal_metric_during_train: True
-  pretrained_model:
-  checkpoints:
-  save_inference_dir:
-  use_visualdl: False
-  infer_img: doc/imgs_words/ch/word_1.jpg
-  # for data or label process
-  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
-  max_text_length: 40
-  infer_mode: False
-  use_space_char: True
-  save_res_path: ./output/rec/predicts_svtr_large_ch.txt
-
-
-Optimizer:
-  name: AdamW
-  beta1: 0.9
-  beta2: 0.99
-  epsilon: 0.00000008
-  weight_decay: 0.05
-  no_weight_decay_name: norm pos_embed
-  one_dim_param_no_weight_decay: true
-  lr:
-    name: Cosine
-    learning_rate: 0.0003
-    warmup_epoch: 5
-
-Architecture:
-  model_type: rec
-  algorithm: SVTR
-  Transform:
-    name: STN_ON
-    tps_inputsize: [32, 64]
-    tps_outputsize: [32, 320]
-    num_control_points: 20
-    tps_margins: [0.05,0.05]
-    stn_activation: none
-  Backbone:
-    name: SVTRNet
-    img_size: [32, 320]  # input size 可以尝试[64,200]
-    out_char_num: 40     # output char patch
-    out_channels: 384    # char patch dim
-    patch_merging: 'Conv'        # 是否使用patch-merging 可选Conv Pool None
-    embed_dim: [192, 256, 512] # 三个阶段的sub-patch dim
-    depth: [3, 9, 9]           # 当使用patch-merging时，控制patch-merging所在的层数，分成三阶段，每个阶段的层数
-    num_heads: [6, 8, 16]       # 三个阶段中的sub-patch heads
-    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
-    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围，7表示高度的范围，11表示宽度的范围
-    prenorm: False
-  Neck:
-    name: SequenceEncoder
-    encoder_type: reshape
-  Head:
-    name: CTCHead
-
-Loss:
-  name: CTCLoss
-
-PostProcess:
-  name: CTCLabelDecode
-
-Metric:
-  name: RecMetric
-  main_indicator: acc
-
-Train:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/scene_ch/ch_scene
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - RecResizeImg:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: True
-    batch_size_per_card: 128
-    drop_last: True
-    num_workers: 2
-
-Eval:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/scene_ch/scene_test
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - RecResizeImg:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: False
-    drop_last: False
-    batch_size_per_card: 256
-    num_workers: 2
diff --git a/configs/rec/svtr/rec_svtr_large_10local_11global_stn_en.yml b/configs/rec/svtr/rec_svtr_large_10local_11global_stn_en.yml
deleted file mode 100644
index b995bb81a2..0000000000
--- a/configs/rec/svtr/rec_svtr_large_10local_11global_stn_en.yml
+++ /dev/null
@@ -1,117 +0,0 @@
-Global:
-  use_gpu: True
-  epoch_num: 20
-  log_smooth_window: 20
-  print_batch_step: 10
-  save_model_dir: ./output/rec/rec_svtr_large_en/
-  save_epoch_step: 1
-  # evaluation is run every 2000 iterations after the 0th iteration
-  eval_batch_step: [0, 2000]
-  cal_metric_during_train: True
-  pretrained_model:
-  checkpoints: 
-  save_inference_dir:
-  use_visualdl: False
-  infer_img: doc/imgs_words_en/word_10.png
-  # for data or label process
-  character_dict_path:
-  character_type: en
-  max_text_length: 25
-  infer_mode: False
-  use_space_char: False
-  save_res_path: ./output/rec/predicts_svtr_large.txt
-
-
-Optimizer:
-  name: AdamW
-  beta1: 0.9
-  beta2: 0.99
-  epsilon: 0.00000008
-  weight_decay: 0.05
-  no_weight_decay_name: norm pos_embed
-  one_dim_param_no_weight_decay: true
-  lr:
-    name: Cosine
-    learning_rate: 0.000125
-    warmup_epoch: 2
-
-Architecture:
-  model_type: rec
-  algorithm: SVTR
-  Transform:
-    name: STN_ON
-    tps_inputsize: [32, 64]
-    tps_outputsize: [48, 160]
-    num_control_points: 20
-    tps_margins: [0.05,0.05]
-    stn_activation: none
-  Backbone:
-    name: SVTRNet
-    img_size: [48, 160]  # input size 可以尝试[64,200]
-    out_char_num: 40     # output char patch
-    out_channels: 384    # char patch dim
-    patch_merging: 'Conv'        # 是否使用patch-merging 可选Conv Pool None
-    embed_dim: [192, 256, 512] # 三个阶段的sub-patch dim
-    depth: [3, 9, 9]           # 当使用patch-merging时，控制patch-merging所在的层数，分成三阶段，每个阶段的层数
-    num_heads: [6, 8, 16]       # 三个阶段中的sub-patch heads
-    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
-    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围，7表示高度的范围，11表示宽度的范围
-    prenorm: false
-  Neck:
-    name: SequenceEncoder
-    encoder_type: reshape
-  Head:
-    name: CTCHead
-
-Loss:
-  name: CTCLoss
-
-PostProcess:
-  name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training
-
-Metric:
-  name: RecMetric
-  main_indicator: acc
-
-Train:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/training
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - RecAug:
-      - CTCLabelEncode: # Class handling label
-      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
-          character_dict_path:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: True
-    batch_size_per_card: 128
-    drop_last: True
-    num_workers: 2
-
-Eval:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/evaluation/
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
-          character_dict_path:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: False
-    drop_last: False
-    batch_size_per_card: 128
-    num_workers: 2
diff --git a/configs/rec/svtr/rec_svtr_small_8local_7global_stn_ch.yml b/configs/rec/svtr/rec_svtr_small_8local_7global_stn_ch.yml
deleted file mode 100644
index 38ddb0e4fc..0000000000
--- a/configs/rec/svtr/rec_svtr_small_8local_7global_stn_ch.yml
+++ /dev/null
@@ -1,114 +0,0 @@
-Global:
-  use_gpu: True
-  epoch_num: 100
-  log_smooth_window: 20
-  print_batch_step: 10
-  save_model_dir: ./output/rec/rec_svtr_small_ch/
-  save_epoch_step: 10
-  # evaluation is run every 2000 iterations after the 0th iteration
-  eval_batch_step: [0, 2000]
-  cal_metric_during_train: True
-  pretrained_model:
-  checkpoints:
-  save_inference_dir:
-  use_visualdl: False
-  infer_img: doc/imgs_words/ch/word_1.jpg
-  # for data or label process
-  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
-  max_text_length: 40
-  infer_mode: False
-  use_space_char: True
-  save_res_path: ./output/rec/predicts_svtr_small_ch.txt
-
-
-Optimizer:
-  name: AdamW
-  beta1: 0.9
-  beta2: 0.99
-  epsilon: 0.00000008
-  weight_decay: 0.05
-  no_weight_decay_name: norm pos_embed
-  one_dim_param_no_weight_decay: true
-  lr:
-    name: Cosine
-    learning_rate: 0.0003
-    warmup_epoch: 5
-
-Architecture:
-  model_type: rec
-  algorithm: SVTR
-  Transform:
-    name: STN_ON
-    tps_inputsize: [32, 64]
-    tps_outputsize: [32, 320]
-    num_control_points: 20
-    tps_margins: [0.05,0.05]
-    stn_activation: none
-  Backbone:
-    name: SVTRNet
-    img_size: [32, 320]  # input size 可以尝试[64,200]
-    out_char_num: 40     # output char patch
-    out_channels: 192    # char patch dim
-    patch_merging: 'Conv'        # 是否使用patch-merging 可选Conv Pool None
-    embed_dim: [96, 192, 256] # 三个阶段的sub-patch dim
-    depth: [3, 6, 6]           # 当使用patch-merging时，控制patch-merging所在的层数，分成三阶段，每个阶段的层数
-    num_heads: [3, 6, 8]       # 三个阶段中的sub-patch heads
-    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
-    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围，7表示高度的范围，11表示宽度的范围
-    last_stage: True
-    prenorm: False
-  Neck:
-    name: SequenceEncoder
-    encoder_type: reshape
-  Head:
-    name: CTCHead
-
-Loss:
-  name: CTCLoss
-
-PostProcess:
-  name: CTCLabelDecode
-
-Metric:
-  name: RecMetric
-  main_indicator: acc
-
-Train:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/scene_ch/ch_scene
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - RecResizeImg:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: True
-    batch_size_per_card: 128
-    drop_last: True
-    num_workers: 2
-
-Eval:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/scene_ch/scene_test
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - RecResizeImg:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: False
-    drop_last: False
-    batch_size_per_card: 256
-    num_workers: 2
diff --git a/configs/rec/svtr/rec_svtr_small_8local_7global_stn_en.yml b/configs/rec/svtr/rec_svtr_small_8local_7global_stn_en.yml
deleted file mode 100644
index 69fea2e1f6..0000000000
--- a/configs/rec/svtr/rec_svtr_small_8local_7global_stn_en.yml
+++ /dev/null
@@ -1,117 +0,0 @@
-Global:
-  use_gpu: True
-  epoch_num: 20
-  log_smooth_window: 20
-  print_batch_step: 10
-  save_model_dir: ./output/rec/rec_svtr_small_stn_en/
-  save_epoch_step: 1
-  # evaluation is run every 2000 iterations after the 0th iteration
-  eval_batch_step: [0, 2000]
-  cal_metric_during_train: True
-  pretrained_model:
-  checkpoints:
-  save_inference_dir:
-  use_visualdl: False
-  infer_img: doc/imgs_words_en/word_10.png
-  # for data or label process
-  character_dict_path:
-  character_type: en
-  max_text_length: 25
-  infer_mode: False
-  use_space_char: False
-  save_res_path: ./output/rec/predicts_svtr_small.txt
-
-
-Optimizer:
-  name: AdamW
-  beta1: 0.9
-  beta2: 0.99
-  epsilon: 0.00000008
-  weight_decay: 0.05
-  no_weight_decay_name: norm pos_embed
-  one_dim_param_no_weight_decay: true
-  lr:
-    name: Cosine
-    learning_rate: 0.0005
-    warmup_epoch: 2
-
-Architecture:
-  model_type: rec
-  algorithm: SVTR
-  Transform:
-    name: STN_ON
-    tps_inputsize: [32, 64]
-    tps_outputsize: [32, 100]
-    num_control_points: 20
-    tps_margins: [0.05,0.05]
-    stn_activation: none
-  Backbone:
-    name: SVTRNet
-    img_size: [32, 100]  # input size 可以尝试[64,200]
-    out_char_num: 25     # output char patch
-    out_channels: 192    # char patch dim
-    patch_merging: 'Conv'        # 是否使用patch-merging 可选Conv Pool None
-    embed_dim: [96, 192, 256] # 三个阶段的sub-patch dim
-    depth: [3, 6, 6]           # 当使用patch-merging时，控制patch-merging所在的层数，分成三阶段，每个阶段的层数
-    num_heads: [3, 6, 8]       # 三个阶段中的sub-patch heads
-    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
-    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围，7表示高度的范围，11表示宽度的范围
-    last_stage: True
-    prenorm: False
-  Neck:
-    name: SequenceEncoder
-    encoder_type: reshape
-  Head:
-    name: CTCHead
-
-Loss:
-  name: CTCLoss
-
-PostProcess:
-  name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training
-
-Metric:
-  name: RecMetric
-  main_indicator: acc
-
-Train:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/training
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
-          character_dict_path:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: True
-    batch_size_per_card: 512
-    drop_last: True
-    num_workers: 4
-
-Eval:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/evaluation
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
-          character_dict_path:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: False
-    drop_last: False
-    batch_size_per_card: 256
-    num_workers: 2
diff --git a/configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_ch.yml b/configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_ch.yml
deleted file mode 100644
index e0d77f632c..0000000000
--- a/configs/rec/svtr/rec_svtr_tiny_6local_6global_stn_ch.yml
+++ /dev/null
@@ -1,114 +0,0 @@
-Global:
-  use_gpu: True
-  epoch_num: 100
-  log_smooth_window: 20
-  print_batch_step: 10
-  save_model_dir: ./output/rec/rec_svtr_tiny_ch/
-  save_epoch_step: 10
-  # evaluation is run every 2000 iterations after the 0th iteration
-  eval_batch_step: [0, 2000]
-  cal_metric_during_train: True
-  pretrained_model:
-  checkpoints:
-  save_inference_dir:
-  use_visualdl: False
-  infer_img: doc/imgs_words/ch/word_1.jpg
-  # for data or label process
-  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
-  max_text_length: 40
-  infer_mode: False
-  use_space_char: True
-  save_res_path: ./output/rec/predicts_svtr_tiny_ch.txt
-
-
-Optimizer:
-  name: AdamW
-  beta1: 0.9
-  beta2: 0.99
-  epsilon: 0.00000008
-  weight_decay: 0.05
-  no_weight_decay_name: norm pos_embed
-  one_dim_param_no_weight_decay: true
-  lr:
-    name: Cosine
-    learning_rate: 0.0003
-    warmup_epoch: 5
-
-Architecture:
-  model_type: rec
-  algorithm: SVTR
-  Transform:
-    name: STN_ON
-    tps_inputsize: [32, 64]
-    tps_outputsize: [32, 320]
-    num_control_points: 20
-    tps_margins: [0.05,0.05]
-    stn_activation: none
-  Backbone:
-    name: SVTRNet
-    img_size: [32, 320]  # input size
-    out_char_num: 40     # number char patch
-    out_channels: 192    # char patch dim
-    patch_merging: 'Conv'        # 是否使用patch-merging 可选Conv Pool None
-    embed_dim: [64, 128, 256] # 三个阶段的sub-patch dim
-    depth: [3, 6, 3]           # 当使用patch-merging时，控制patch-merging所在的层数，分成三阶段，每个阶段的层数
-    num_heads: [2, 4, 8]       # 三个阶段中的sub-patch heads
-    mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
-    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer的范围，7表示高度的范围，11表示宽度的范围
-    last_stage: True      # 三个阶段中的sub-patch heads
-    prenorm: false
-  Neck:
-    name: SequenceEncoder
-    encoder_type: reshape
-  Head:
-    name: CTCHead
-
-Loss:
-  name: CTCLoss
-
-PostProcess:
-  name: CTCLabelDecode
-
-Metric:
-  name: RecMetric
-  main_indicator: acc
-
-Train:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/scene_ch/ch_scene
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - RecResizeImg:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: True
-    batch_size_per_card: 128
-    drop_last: True
-    num_workers: 2
-
-Eval:
-  dataset:
-    name: LMDBDataSet
-    data_dir: ./train_data/scene_ch/scene_test
-    transforms:
-      - DecodeImage: # load image
-          img_mode: BGR
-          channel_first: False
-      - CTCLabelEncode: # Class handling label
-      - RecResizeImg:
-          image_shape: [3, 64, 256]
-          padding: False
-      - KeepKeys:
-          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
-  loader:
-    shuffle: False
-    drop_last: False
-    batch_size_per_card: 256
-    num_workers: 2
diff --git a/deploy/slim/quantization/quant.py b/deploy/slim/quantization/quant.py
index 1dffaab0ee..355ba77f83 100755
--- a/deploy/slim/quantization/quant.py
+++ b/deploy/slim/quantization/quant.py
@@ -137,7 +137,7 @@ def main(config, device, logger, vdl_writer):
         config['Optimizer'],
         epochs=config['Global']['epoch_num'],
         step_each_epoch=len(train_dataloader),
-        parameters=model.parameters())
+        model=model)
 
     # resume PACT training process
     if config["Global"]["checkpoints"] is not None:
diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py
index e3fb4d7eb3..501ef87ce9 100644
--- a/ppocr/data/imaug/rec_img_aug.py
+++ b/ppocr/data/imaug/rec_img_aug.py
@@ -209,17 +209,16 @@ class PRENResizeImg(object):
 
 class SVTRRecResizeImg(object):
     def __init__(self,
-                        image_shape,
-                        infer_mode=False,
-                        character_dict_path='./ppocr/utils/ppocr_keys_v1.txt',
-                        padding=True,
-                        **kwargs):
+                 image_shape,
+                 infer_mode=False,
+                 character_dict_path='./ppocr/utils/ppocr_keys_v1.txt',
+                 padding=True,
+                 **kwargs):
         self.image_shape = image_shape
         self.infer_mode = infer_mode
         self.character_dict_path = character_dict_path
         self.padding = padding
 
-
     def __call__(self, data):
         img = data['image']
         norm_img = resize_norm_img_svtr(img, self.image_shape, self.padding)
@@ -227,7 +226,6 @@ class SVTRRecResizeImg(object):
         return data
 
 
-
 def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
     imgC, imgH, imgW_min, imgW_max = image_shape
     h = img.shape[0]
@@ -346,23 +344,21 @@ def resize_norm_img_srn(img, image_shape):
     return np.reshape(img_black, (c, row, col)).astype(np.float32)
 
 
-
-def resize_norm_img_svtr(img, image_shape, padding=True):
+def resize_norm_img_svtr(img, image_shape, padding=False):
     imgC, imgH, imgW = image_shape
     h = img.shape[0]
     w = img.shape[1]
     if not padding:
-        
         if h > 2.0 * w:
-                image = Image.fromarray(img) 
-                image1 = image.rotate(90, expand=True)
-                image2 = image.rotate(-90, expand=True)
-                img1 = np.array(image1)
-                img2 = np.array(image2)
+            image = Image.fromarray(img)
+            image1 = image.rotate(90, expand=True)
+            image2 = image.rotate(-90, expand=True)
+            img1 = np.array(image1)
+            img2 = np.array(image2)
         else:
-                img1 = copy.deepcopy(img)
-                img2 = copy.deepcopy(img)
-        
+            img1 = copy.deepcopy(img)
+            img2 = copy.deepcopy(img)
+
         resized_image = cv2.resize(
             img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
         resized_image1 = cv2.resize(
diff --git a/ppocr/modeling/backbones/rec_svtrnet.py b/ppocr/modeling/backbones/rec_svtrnet.py
index 2b2c6c6de5..5ded74378c 100644
--- a/ppocr/modeling/backbones/rec_svtrnet.py
+++ b/ppocr/modeling/backbones/rec_svtrnet.py
@@ -296,47 +296,47 @@ class PatchEmbed(nn.Layer):
         if sub_num == 2:
             self.proj = nn.Sequential(
                 ConvBNLayer(
-                    in_channels,
-                    embed_dim // 2,
-                    3,
-                    2,
-                    1,
+                    in_channels=in_channels,
+                    out_channels=embed_dim // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                     act=nn.GELU,
                     bias_attr=None),
                 ConvBNLayer(
-                    embed_dim // 2,
-                    embed_dim,
-                    3,
-                    2,
-                    1,
+                    in_channels=embed_dim // 2,
+                    out_channels=embed_dim,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                     act=nn.GELU,
                     bias_attr=None))
         if sub_num == 3:
             self.proj = nn.Sequential(
                 ConvBNLayer(
-                    in_channels,
-                    embed_dim // 4,
-                    3,
-                    2,
-                    1,
+                    in_channels=in_channels,
+                    out_channels=embed_dim // 4,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                     act=nn.GELU,
                     bias_attr=None),
                 ConvBNLayer(
-                    embed_dim // 4,
-                    embed_dim // 2,
-                    3,
-                    2,
-                    1,
+                    in_channels=embed_dim // 4,
+                    out_channels=embed_dim // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                     act=nn.GELU,
                     bias_attr=None),
                 ConvBNLayer(
-                    embed_dim // 2,
-                    embed_dim,
-                    3,
-                    2,
-                    1,
+                    in_channels=embed_dim // 2,
+                    out_channels=embed_dim,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                     act=nn.GELU,
-                    bias_attr=None), )
+                    bias_attr=None))
 
     def forward(self, x):
         B, C, H, W = x.shape