delete yml file, fix quant and svtr
parent 68fb057dc6
commit 1c75ff630e
@@ -3,7 +3,7 @@ Global:
   epoch_num: 20
   log_smooth_window: 20
   print_batch_step: 10
-  save_model_dir: ./output/rec/rec_svtr_tiny_en/
+  save_model_dir: ./output/rec/svtr_tiny/
   save_epoch_step: 1
   # evaluation is run every 2000 iterations after the 0th iteration
   eval_batch_step: [0, 2000]
@@ -47,16 +47,16 @@ Architecture:
     stn_activation: none
   Backbone:
     name: SVTRNet
-    img_size: [32, 100] # input size; [64, 200] is also worth trying
-    out_char_num: 25 # output char patch
-    out_channels: 192 # char patch dim
-    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
-    embed_dim: [64, 128, 256] # sub-patch dim of the three stages
-    depth: [3, 6, 3] # number of layers in each of the three stages; patch merging sits between stages
-    num_heads: [2, 4, 8] # sub-patch heads of the three stages
-    mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
-    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
-    last_stage: True # whether to use the last stage
+    img_size: [32, 100]
+    out_char_num: 25
+    out_channels: 192
+    patch_merging: 'Conv'
+    embed_dim: [64, 128, 256]
+    depth: [3, 6, 3]
+    num_heads: [2, 4, 8]
+    mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global']
+    local_mixer: [[7, 11], [7, 11], [7, 11]]
+    last_stage: True
     prenorm: false
   Neck:
     name: SequenceEncoder
@@ -93,12 +93,12 @@ Train:
     shuffle: True
     batch_size_per_card: 512
     drop_last: True
-    num_workers: 2
+    num_workers: 4

 Eval:
   dataset:
     name: LMDBDataSet
-    data_dir: ./train_data/data_lmdb_release/evaluation/
+    data_dir: ./train_data/data_lmdb_release/validation/
     transforms:
       - DecodeImage: # load image
           img_mode: BGR
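For context, the keys under `Backbone` are handed to the backbone constructor as keyword arguments, so the block kept above corresponds roughly to the call sketched below (the import path and builder plumbing are assumptions, not shown in this diff):

```python
# Illustrative mapping of the YAML Backbone keys onto the SVTRNet constructor;
# the import path is assumed, not stated in this commit.
from ppocr.modeling.backbones.rec_svtrnet import SVTRNet

backbone = SVTRNet(
    img_size=[32, 100],
    out_char_num=25,
    out_channels=192,
    patch_merging='Conv',
    embed_dim=[64, 128, 256],
    depth=[3, 6, 3],
    num_heads=[2, 4, 8],
    mixer=['Local'] * 6 + ['Global'] * 6,
    local_mixer=[[7, 11], [7, 11], [7, 11]],
    last_stage=True,
    prenorm=False)
```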
@@ -1,113 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_base_stn_ch/
  save_epoch_step: 10
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 40
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_svtr_base_ch.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.0003
    warmup_epoch: 5

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 320]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [32, 320] # input size; [64, 200] is also worth trying
    out_char_num: 40 # output char patch
    out_channels: 256 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [128, 256, 384] # sub-patch dim of the three stages
    depth: [3, 6, 9] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [4, 8, 12] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    prenorm: False
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/ch_scene
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 2

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/scene_test
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
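One pattern visible across the STN configs in this commit (an observation, not something stated in the diff): `Backbone.img_size` always matches `Transform.tps_outputsize`, since the STN resamples the input to that size before the backbone sees it. A quick check over the deleted configs:

```python
# (tps_outputsize, img_size) pairs copied from the deleted configs.
pairs = {
    "rec_svtr_base_stn_ch":  ([32, 320], [32, 320]),
    "rec_svtr_base_stn_en":  ([48, 160], [48, 160]),
    "rec_svtr_small_stn_en": ([32, 100], [32, 100]),
    "rec_svtr_tiny_ch":      ([32, 320], [32, 320]),
}
for name, (tps_outputsize, img_size) in pairs.items():
    assert tps_outputsize == img_size, name
    print(f"{name}: STN output {tps_outputsize} feeds backbone img_size {img_size}")
```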
@@ -1,117 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_base_stn_en/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words_en/word_10.png
  # for data or label process
  character_dict_path:
  character_type: en
  max_text_length: 25
  infer_mode: False
  use_space_char: False
  save_res_path: ./output/rec/predicts_svtr_base.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.00025
    warmup_epoch: 2

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [48, 160]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [48, 160] # input size; [64, 200] is also worth trying
    out_char_num: 40 # output char patch
    out_channels: 256 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [128, 256, 384] # sub-patch dim of the three stages
    depth: [3, 6, 9] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [4, 8, 12] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    last_stage: True
    prenorm: False
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/training
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/evaluation/
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 2
@@ -1,113 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_large_ch/
  save_epoch_step: 10
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 40
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_svtr_large_ch.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.0003
    warmup_epoch: 5

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 320]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [32, 320] # input size; [64, 200] is also worth trying
    out_char_num: 40 # output char patch
    out_channels: 384 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [192, 256, 512] # sub-patch dim of the three stages
    depth: [3, 9, 9] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [6, 8, 16] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    prenorm: False
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/ch_scene
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 2

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/scene_test
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
@@ -1,117 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_large_en/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words_en/word_10.png
  # for data or label process
  character_dict_path:
  character_type: en
  max_text_length: 25
  infer_mode: False
  use_space_char: False
  save_res_path: ./output/rec/predicts_svtr_large.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.000125
    warmup_epoch: 2

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [48, 160]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [48, 160] # input size; [64, 200] is also worth trying
    out_char_num: 40 # output char patch
    out_channels: 384 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [192, 256, 512] # sub-patch dim of the three stages
    depth: [3, 9, 9] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [6, 8, 16] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    prenorm: false
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/training
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - RecAug:
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 2

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/evaluation/
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 2
@@ -1,114 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_small_ch/
  save_epoch_step: 10
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 40
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_svtr_small_ch.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.0003
    warmup_epoch: 5

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 320]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [32, 320] # input size; [64, 200] is also worth trying
    out_char_num: 40 # output char patch
    out_channels: 192 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [96, 192, 256] # sub-patch dim of the three stages
    depth: [3, 6, 6] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [3, 6, 8] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    last_stage: True
    prenorm: False
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/ch_scene
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 2

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/scene_test
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
@@ -1,117 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_small_stn_en/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words_en/word_10.png
  # for data or label process
  character_dict_path:
  character_type: en
  max_text_length: 25
  infer_mode: False
  use_space_char: False
  save_res_path: ./output/rec/predicts_svtr_small.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.0005
    warmup_epoch: 2

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 100]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [32, 100] # input size; [64, 200] is also worth trying
    out_char_num: 25 # output char patch
    out_channels: 192 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [96, 192, 256] # sub-patch dim of the three stages
    depth: [3, 6, 6] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [3, 6, 8] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    last_stage: True
    prenorm: False
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: SVTRLabelDecode # SVTRLabelDecode is used for eval, please change to CTCLabelDecode when training

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/training
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 512
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/evaluation
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - SVTRRecResizeImg: # SVTRRecResizeImg is used for eval, please change to RecResizeImg when training
          character_dict_path:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
@@ -1,114 +0,0 @@
Global:
  use_gpu: True
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/rec_svtr_tiny_ch/
  save_epoch_step: 10
  # evaluation is run every 2000 iterations after the 0th iteration
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 40
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_svtr_tiny_ch.txt

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.99
  epsilon: 0.00000008
  weight_decay: 0.05
  no_weight_decay_name: norm pos_embed
  one_dim_param_no_weight_decay: true
  lr:
    name: Cosine
    learning_rate: 0.0003
    warmup_epoch: 5

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 320]
    num_control_points: 20
    tps_margins: [0.05,0.05]
    stn_activation: none
  Backbone:
    name: SVTRNet
    img_size: [32, 320] # input size
    out_char_num: 40 # number of output char patches
    out_channels: 192 # char patch dim
    patch_merging: 'Conv' # whether to use patch merging; options: Conv, Pool, None
    embed_dim: [64, 128, 256] # sub-patch dim of the three stages
    depth: [3, 6, 3] # number of layers in each of the three stages; patch merging sits between stages
    num_heads: [2, 4, 8] # sub-patch heads of the three stages
    mixer: ['Local','Local','Local','Local','Local','Local','Global','Global','Global','Global','Global','Global'] # Local atten, Global atten, Conv
    local_mixer: [[7, 11], [7, 11], [7, 11]] # local mixer window; 7 is the height range, 11 the width range
    last_stage: True # whether to use the last stage
    prenorm: false
  Neck:
    name: SequenceEncoder
    encoder_type: reshape
  Head:
    name: CTCHead

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/ch_scene
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 2

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/scene_ch/scene_test
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
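Another relationship that holds across all of the deleted configs (an observation, not something stated in the diff): `mixer` lists one entry per transformer block, so its length equals `sum(depth)`. A quick sanity check of the four model sizes:

```python
# (depth, mixer) settings copied from the deleted tiny/small/base/large configs.
settings = {
    "tiny":  ([3, 6, 3],  ['Local'] * 6  + ['Global'] * 6),
    "small": ([3, 6, 6],  ['Local'] * 8  + ['Global'] * 7),
    "base":  ([3, 6, 9],  ['Local'] * 8  + ['Global'] * 10),
    "large": ([3, 9, 9],  ['Local'] * 10 + ['Global'] * 11),
}
for name, (depth, mixer) in settings.items():
    assert len(mixer) == sum(depth), name
    print(f"{name}: sum(depth)={sum(depth)}, len(mixer)={len(mixer)}")
```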
@@ -137,7 +137,7 @@ def main(config, device, logger, vdl_writer):
        config['Optimizer'],
        epochs=config['Global']['epoch_num'],
        step_each_epoch=len(train_dataloader),
-        parameters=model.parameters())
+        model=model)

    # resume PACT training process
    if config["Global"]["checkpoints"] is not None:
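Passing the whole model instead of `model.parameters()` gives the optimizer builder access to parameter names and shapes, which the SVTR configs rely on through `no_weight_decay_name: norm pos_embed` and `one_dim_param_no_weight_decay: true`. A minimal sketch of that grouping idea, as a hypothetical helper rather than the repo's actual `build_optimizer`:

```python
def split_weight_decay_groups(model, skip_names=("norm", "pos_embed"),
                              skip_one_dim=True, weight_decay=0.05):
    """Split a model's parameters into decayed and non-decayed groups
    by name and dimensionality (sketch only, not the repo's code)."""
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if any(key in name for key in skip_names) or (
                skip_one_dim and len(param.shape) == 1):
            no_decay.append(param)
        else:
            decay.append(param)
    # Parameter groups in the form most optimizers accept.
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]
```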
@@ -209,17 +209,16 @@ class PRENResizeImg(object):

class SVTRRecResizeImg(object):
    def __init__(self,
-                 image_shape,
-                 infer_mode=False,
-                 character_dict_path='./ppocr/utils/ppocr_keys_v1.txt',
-                 padding=True,
-                 **kwargs):
+                 image_shape,
+                 infer_mode=False,
+                 character_dict_path='./ppocr/utils/ppocr_keys_v1.txt',
+                 padding=True,
+                 **kwargs):
        self.image_shape = image_shape
        self.infer_mode = infer_mode
        self.character_dict_path = character_dict_path
        self.padding = padding

    def __call__(self, data):
        img = data['image']
        norm_img = resize_norm_img_svtr(img, self.image_shape, self.padding)

@@ -227,7 +226,6 @@ class SVTRRecResizeImg(object):
        return data


def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
    imgC, imgH, imgW_min, imgW_max = image_shape
    h = img.shape[0]

@@ -346,23 +344,21 @@ def resize_norm_img_srn(img, image_shape):
    return np.reshape(img_black, (c, row, col)).astype(np.float32)


-def resize_norm_img_svtr(img, image_shape, padding=True):
+def resize_norm_img_svtr(img, image_shape, padding=False):
    imgC, imgH, imgW = image_shape
    h = img.shape[0]
    w = img.shape[1]
-    if not padding:
    if h > 2.0 * w:
-        image = Image.fromarray(img)
-        image1 = image.rotate(90, expand=True)
-        image2 = image.rotate(-90, expand=True)
-        img1 = np.array(image1)
-        img2 = np.array(image2)
+        image = Image.fromarray(img)
+        image1 = image.rotate(90, expand=True)
+        image2 = image.rotate(-90, expand=True)
+        img1 = np.array(image1)
+        img2 = np.array(image2)
    else:
-        img1 = copy.deepcopy(img)
-        img2 = copy.deepcopy(img)
+        img1 = copy.deepcopy(img)
+        img2 = copy.deepcopy(img)

    resized_image = cv2.resize(
        img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
    resized_image1 = cv2.resize(
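With the default `padding` flag for `resize_norm_img_svtr` flipped to False, the plain fixed-size resize path is taken by default. A rough sketch of that resize-and-normalize step, assuming the usual PaddleOCR convention of a 3-channel HWC input scaled to [-1, 1] in CHW layout (not the exact function body):

```python
import cv2

def resize_norm_img_fixed(img, image_shape=(3, 64, 256)):
    # Resize to the target (imgH, imgW), scale to [0, 1], map to [-1, 1],
    # and return a CHW float32 array -- a sketch of the SVTR-style preprocessing.
    imgC, imgH, imgW = image_shape
    resized = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
    resized = resized.astype('float32').transpose((2, 0, 1)) / 255.0
    resized -= 0.5
    resized /= 0.5
    return resized
```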
@@ -296,47 +296,49 @@ class PatchEmbed(nn.Layer):
        if sub_num == 2:
            self.proj = nn.Sequential(
                ConvBNLayer(
-                    in_channels,
-                    embed_dim // 2,
-                    3,
-                    2,
-                    1,
+                    in_channels=in_channels,
+                    out_channels=embed_dim // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
-                    embed_dim // 2,
-                    embed_dim,
-                    3,
-                    2,
-                    1,
+                    in_channels=embed_dim // 2,
+                    out_channels=embed_dim,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                    act=nn.GELU,
                    bias_attr=None))
        if sub_num == 3:
            self.proj = nn.Sequential(
                ConvBNLayer(
-                    in_channels,
-                    embed_dim // 4,
-                    3,
-                    2,
-                    1,
+                    in_channels=in_channels,
+                    out_channels=embed_dim // 4,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
-                    embed_dim // 4,
-                    embed_dim // 2,
-                    3,
-                    2,
-                    1,
+                    in_channels=embed_dim // 4,
+                    out_channels=embed_dim // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
-                    embed_dim // 2,
-                    embed_dim,
-                    3,
-                    2,
-                    1,
+                    in_channels=embed_dim // 2,
+                    out_channels=embed_dim,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
                    act=nn.GELU,
-                    bias_attr=None), )
+                    bias_attr=None))

    def forward(self, x):
        B, C, H, W = x.shape
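The `ConvBNLayer` calls above now spell out their arguments by keyword, which makes the conv/BN patch-embedding stem easier to read and less error-prone. A hypothetical minimal `ConvBNLayer` consistent with those keywords (the repo's real implementation may differ in details):

```python
import paddle.nn as nn

class ConvBNLayer(nn.Layer):
    # Hypothetical sketch matching the keyword arguments used above.
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1,
                 padding=0, act=None, bias_attr=False):
        super().__init__()
        self.conv = nn.Conv2D(in_channels, out_channels, kernel_size,
                              stride=stride, padding=padding,
                              bias_attr=bias_attr)
        self.norm = nn.BatchNorm2D(out_channels)
        # `act` is passed as a layer class (e.g. nn.GELU) and instantiated here.
        self.act = act() if act is not None else None

    def forward(self, x):
        x = self.norm(self.conv(x))
        return self.act(x) if self.act is not None else x
```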