add more dataset yamls and fix re exceptions (#6791)
* add more dataset yamls and fix re exceptions
pull/6818/head
parent
5a0108b8ac
commit
7a99588dd8
|
@ -0,0 +1,125 @@
|
|||
Global:
|
||||
use_gpu: True
|
||||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/re_layoutlmv2_funsd
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 57 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 57 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: train_data/FUNSD/testing_data/images/83624198.png
|
||||
save_res_path: ./output/re_layoutlmv2_funsd/res/
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
||||
algorithm: &algorithm "LayoutLMv2"
|
||||
Transform:
|
||||
Backbone:
|
||||
name: LayoutLMv2ForRe
|
||||
pretrained: True
|
||||
checkpoints:
|
||||
|
||||
Loss:
|
||||
name: LossFromOutput
|
||||
key: loss
|
||||
reduction: mean
|
||||
|
||||
Optimizer:
|
||||
name: AdamW
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
clip_norm: 10
|
||||
lr:
|
||||
learning_rate: 0.00005
|
||||
warmup_epoch: 10
|
||||
regularizer:
|
||||
name: L2
|
||||
factor: 0.00000
|
||||
|
||||
PostProcess:
|
||||
name: VQAReTokenLayoutLMPostProcess
|
||||
|
||||
Metric:
|
||||
name: VQAReTokenMetric
|
||||
main_indicator: hmean
|
||||
|
||||
Train:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/FUNSD/training_data/images/
|
||||
label_file_list:
|
||||
- ./train_data/FUNSD/train.json
|
||||
ratio_list: [ 1.0 ]
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: True
|
||||
algorithm: *algorithm
|
||||
class_path: &class_path train_data/FUNSD/class_list.txt
|
||||
- VQATokenPad:
|
||||
max_seq_len: &max_seq_len 512
|
||||
return_attention_mask: True
|
||||
- VQAReTokenRelation:
|
||||
- VQAReTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1./255.
|
||||
mean: [0.485, 0.456, 0.406]
|
||||
std: [0.229, 0.224, 0.225]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations']
|
||||
loader:
|
||||
shuffle: True
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 8
|
||||
collate_fn: ListCollator
|
||||
|
||||
Eval:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/FUNSD/testing_data/images/
|
||||
label_file_list:
|
||||
- ./train_data/FUNSD/test.json
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: True
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: *max_seq_len
|
||||
return_attention_mask: True
|
||||
- VQAReTokenRelation:
|
||||
- VQAReTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1./255.
|
||||
mean: [0.485, 0.456, 0.406]
|
||||
std: [0.229, 0.224, 0.225]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 8
|
||||
collate_fn: ListCollator
|
|
@ -3,16 +3,16 @@ Global:
|
|||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/re_layoutlmv2/
|
||||
save_model_dir: ./output/re_layoutlmv2_xfund_zh
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 57 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 19 ]
|
||||
eval_batch_step: [ 0, 57 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2048
|
||||
infer_img: ppstructure/docs/vqa/input/zh_val_21.jpg
|
||||
save_res_path: ./output/re/
|
||||
save_res_path: ./output/re_layoutlmv2_xfund_zh/res/
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
|
@ -0,0 +1,129 @@
|
|||
Global:
|
||||
use_gpu: True
|
||||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/re_layoutxlm_funsd
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 57 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 57 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: train_data/FUNSD/testing_data/images/83624198.png
|
||||
save_res_path: ./output/re_layoutxlm_funsd/res/
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
||||
algorithm: &algorithm "LayoutXLM"
|
||||
Transform:
|
||||
Backbone:
|
||||
name: LayoutXLMForRe
|
||||
pretrained: True
|
||||
checkpoints:
|
||||
|
||||
Loss:
|
||||
name: LossFromOutput
|
||||
key: loss
|
||||
reduction: mean
|
||||
|
||||
Optimizer:
|
||||
name: AdamW
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
clip_norm: 10
|
||||
lr:
|
||||
learning_rate: 0.00005
|
||||
warmup_epoch: 10
|
||||
regularizer:
|
||||
name: L2
|
||||
factor: 0.00000
|
||||
|
||||
PostProcess:
|
||||
name: VQAReTokenLayoutLMPostProcess
|
||||
|
||||
Metric:
|
||||
name: VQAReTokenMetric
|
||||
main_indicator: hmean
|
||||
|
||||
Train:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/FUNSD/training_data/images/
|
||||
label_file_list:
|
||||
- ./train_data/FUNSD/train_v4.json
|
||||
# - ./train_data/FUNSD/train.json
|
||||
ratio_list: [ 1.0 ]
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: True
|
||||
algorithm: *algorithm
|
||||
class_path: &class_path ./train_data/FUNSD/class_list.txt
|
||||
use_textline_bbox_info: &use_textline_bbox_info True
|
||||
- VQATokenPad:
|
||||
max_seq_len: &max_seq_len 512
|
||||
return_attention_mask: True
|
||||
- VQAReTokenRelation:
|
||||
- VQAReTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 16
|
||||
collate_fn: ListCollator
|
||||
|
||||
Eval:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/FUNSD/testing_data/images/
|
||||
label_file_list:
|
||||
- ./train_data/FUNSD/test_v4.json
|
||||
# - ./train_data/FUNSD/test.json
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: True
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
use_textline_bbox_info: *use_textline_bbox_info
|
||||
- VQATokenPad:
|
||||
max_seq_len: *max_seq_len
|
||||
return_attention_mask: True
|
||||
- VQAReTokenRelation:
|
||||
- VQAReTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'entities', 'relations']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 8
|
||||
collate_fn: ListCollator
|
|
@ -0,0 +1,124 @@
|
|||
Global:
|
||||
use_gpu: True
|
||||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/ser_layoutlm_funsd
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 57 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 57 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: train_data/FUNSD/testing_data/images/83624198.png
|
||||
save_res_path: ./output/ser_layoutlm_funsd/res/
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
||||
algorithm: &algorithm "LayoutLM"
|
||||
Transform:
|
||||
Backbone:
|
||||
name: LayoutLMForSer
|
||||
pretrained: True
|
||||
checkpoints:
|
||||
num_classes: &num_classes 7
|
||||
|
||||
Loss:
|
||||
name: VQASerTokenLayoutLMLoss
|
||||
num_classes: *num_classes
|
||||
|
||||
Optimizer:
|
||||
name: AdamW
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
lr:
|
||||
name: Linear
|
||||
learning_rate: 0.00005
|
||||
epochs: *epoch_num
|
||||
warmup_epoch: 2
|
||||
regularizer:
|
||||
name: L2
|
||||
factor: 0.00000
|
||||
|
||||
PostProcess:
|
||||
name: VQASerTokenLayoutLMPostProcess
|
||||
class_path: &class_path ./train_data/FUNSD/class_list.txt
|
||||
|
||||
Metric:
|
||||
name: VQASerTokenMetric
|
||||
main_indicator: hmean
|
||||
|
||||
Train:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/FUNSD/training_data/images/
|
||||
label_file_list:
|
||||
- ./train_data/FUNSD/train.json
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
use_textline_bbox_info: &use_textline_bbox_info True
|
||||
- VQATokenPad:
|
||||
max_seq_len: &max_seq_len 512
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: True
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
||||
|
||||
Eval:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: train_data/FUNSD/testing_data/images/
|
||||
label_file_list:
|
||||
- ./train_data/FUNSD/test.json
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
use_textline_bbox_info: *use_textline_bbox_info
|
||||
- VQATokenPad:
|
||||
max_seq_len: *max_seq_len
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
|
@ -0,0 +1,124 @@
|
|||
Global:
|
||||
use_gpu: True
|
||||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/ser_layoutlm_sroie
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 200 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 200 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: train_data/SROIE/test/X00016469670.jpg
|
||||
save_res_path: ./output/ser_layoutlm_sroie/res/
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
||||
algorithm: &algorithm "LayoutLM"
|
||||
Transform:
|
||||
Backbone:
|
||||
name: LayoutLMForSer
|
||||
pretrained: True
|
||||
checkpoints:
|
||||
num_classes: &num_classes 9
|
||||
|
||||
Loss:
|
||||
name: VQASerTokenLayoutLMLoss
|
||||
num_classes: *num_classes
|
||||
|
||||
Optimizer:
|
||||
name: AdamW
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
lr:
|
||||
name: Linear
|
||||
learning_rate: 0.00005
|
||||
epochs: *epoch_num
|
||||
warmup_epoch: 2
|
||||
regularizer:
|
||||
name: L2
|
||||
factor: 0.00000
|
||||
|
||||
PostProcess:
|
||||
name: VQASerTokenLayoutLMPostProcess
|
||||
class_path: &class_path ./train_data/SROIE/class_list.txt
|
||||
|
||||
Metric:
|
||||
name: VQASerTokenMetric
|
||||
main_indicator: hmean
|
||||
|
||||
Train:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/SROIE/train
|
||||
label_file_list:
|
||||
- ./train_data/SROIE/train.txt
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
use_textline_bbox_info: &use_textline_bbox_info True
|
||||
- VQATokenPad:
|
||||
max_seq_len: &max_seq_len 512
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: True
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
||||
|
||||
Eval:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/SROIE/test
|
||||
label_file_list:
|
||||
- ./train_data/SROIE/test.txt
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
use_textline_bbox_info: *use_textline_bbox_info
|
||||
- VQATokenPad:
|
||||
max_seq_len: *max_seq_len
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
|
@ -3,16 +3,16 @@ Global:
|
|||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/ser_layoutlm/
|
||||
save_model_dir: ./output/ser_layoutlm_xfund_zh
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 57 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 19 ]
|
||||
eval_batch_step: [ 0, 57 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg
|
||||
save_res_path: ./output/ser/
|
||||
save_res_path: ./output/ser_layoutlm_xfund_zh/res/
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
|
@ -0,0 +1,123 @@
|
|||
Global:
|
||||
use_gpu: True
|
||||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/ser_layoutlmv2_funsd
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 100 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 100 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: train_data/FUNSD/testing_data/images/83624198.png
|
||||
save_res_path: ./output/ser_layoutlmv2_funsd/res/
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
||||
algorithm: &algorithm "LayoutLMv2"
|
||||
Transform:
|
||||
Backbone:
|
||||
name: LayoutLMv2ForSer
|
||||
pretrained: True
|
||||
checkpoints:
|
||||
num_classes: &num_classes 7
|
||||
|
||||
Loss:
|
||||
name: VQASerTokenLayoutLMLoss
|
||||
num_classes: *num_classes
|
||||
|
||||
Optimizer:
|
||||
name: AdamW
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
lr:
|
||||
name: Linear
|
||||
learning_rate: 0.00005
|
||||
epochs: *epoch_num
|
||||
warmup_epoch: 2
|
||||
regularizer:
|
||||
|
||||
name: L2
|
||||
factor: 0.00000
|
||||
|
||||
PostProcess:
|
||||
name: VQASerTokenLayoutLMPostProcess
|
||||
class_path: &class_path train_data/FUNSD/class_list.txt
|
||||
|
||||
Metric:
|
||||
name: VQASerTokenMetric
|
||||
main_indicator: hmean
|
||||
|
||||
Train:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/FUNSD/training_data/images/
|
||||
label_file_list:
|
||||
- ./train_data/FUNSD/train.json
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: &max_seq_len 512
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: True
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
||||
|
||||
Eval:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/FUNSD/testing_data/images/
|
||||
label_file_list:
|
||||
- ./train_data/FUNSD/test.json
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: *max_seq_len
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
|
@ -0,0 +1,123 @@
|
|||
Global:
|
||||
use_gpu: True
|
||||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/ser_layoutlmv2_sroie
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 200 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 200 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: train_data/SROIE/test/X00016469670.jpg
|
||||
save_res_path: ./output/ser_layoutlmv2_sroie/res/
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
||||
algorithm: &algorithm "LayoutLMv2"
|
||||
Transform:
|
||||
Backbone:
|
||||
name: LayoutLMv2ForSer
|
||||
pretrained: True
|
||||
checkpoints:
|
||||
num_classes: &num_classes 9
|
||||
|
||||
Loss:
|
||||
name: VQASerTokenLayoutLMLoss
|
||||
num_classes: *num_classes
|
||||
|
||||
Optimizer:
|
||||
name: AdamW
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
lr:
|
||||
name: Linear
|
||||
learning_rate: 0.00005
|
||||
epochs: *epoch_num
|
||||
warmup_epoch: 2
|
||||
regularizer:
|
||||
|
||||
name: L2
|
||||
factor: 0.00000
|
||||
|
||||
PostProcess:
|
||||
name: VQASerTokenLayoutLMPostProcess
|
||||
class_path: &class_path ./train_data/SROIE/class_list.txt
|
||||
|
||||
Metric:
|
||||
name: VQASerTokenMetric
|
||||
main_indicator: hmean
|
||||
|
||||
Train:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/SROIE/train
|
||||
label_file_list:
|
||||
- ./train_data/SROIE/train.txt
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: &max_seq_len 512
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: True
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
||||
|
||||
Eval:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/SROIE/test
|
||||
label_file_list:
|
||||
- ./train_data/SROIE/test.txt
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: *max_seq_len
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
|
@ -3,7 +3,7 @@ Global:
|
|||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/ser_layoutlmv2/
|
||||
save_model_dir: ./output/ser_layoutlmv2_xfund_zh/
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 19 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 19 ]
|
||||
|
@ -12,7 +12,7 @@ Global:
|
|||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg
|
||||
save_res_path: ./output/ser/
|
||||
save_res_path: ./output/ser_layoutlmv2_xfund_zh/res/
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
|
@ -0,0 +1,123 @@
|
|||
Global:
|
||||
use_gpu: True
|
||||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/ser_layoutxlm_funsd
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 57 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 57 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: train_data/FUNSD/testing_data/images/83624198.png
|
||||
save_res_path: output/ser_layoutxlm_funsd/res/
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
||||
algorithm: &algorithm "LayoutXLM"
|
||||
Transform:
|
||||
Backbone:
|
||||
name: LayoutXLMForSer
|
||||
pretrained: True
|
||||
checkpoints:
|
||||
num_classes: &num_classes 7
|
||||
|
||||
Loss:
|
||||
name: VQASerTokenLayoutLMLoss
|
||||
num_classes: *num_classes
|
||||
|
||||
Optimizer:
|
||||
name: AdamW
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
lr:
|
||||
name: Linear
|
||||
learning_rate: 0.00005
|
||||
epochs: *epoch_num
|
||||
warmup_epoch: 2
|
||||
regularizer:
|
||||
name: L2
|
||||
factor: 0.00000
|
||||
|
||||
PostProcess:
|
||||
name: VQASerTokenLayoutLMPostProcess
|
||||
class_path: &class_path ./train_data/FUNSD/class_list.txt
|
||||
|
||||
Metric:
|
||||
name: VQASerTokenMetric
|
||||
main_indicator: hmean
|
||||
|
||||
Train:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/FUNSD/training_data/images/
|
||||
label_file_list:
|
||||
- ./train_data/FUNSD/train.json
|
||||
ratio_list: [ 1.0 ]
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: &max_seq_len 512
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: True
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
||||
|
||||
Eval:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: train_data/FUNSD/testing_data/images/
|
||||
label_file_list:
|
||||
- ./train_data/FUNSD/test.json
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: *max_seq_len
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
|
@ -0,0 +1,123 @@
|
|||
Global:
|
||||
use_gpu: True
|
||||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/ser_layoutxlm_sroie
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 200 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 200 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: train_data/SROIE/test/X00016469670.jpg
|
||||
save_res_path: res_img_aug_with_gt
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
||||
algorithm: &algorithm "LayoutXLM"
|
||||
Transform:
|
||||
Backbone:
|
||||
name: LayoutXLMForSer
|
||||
pretrained: True
|
||||
checkpoints:
|
||||
num_classes: &num_classes 9
|
||||
|
||||
Loss:
|
||||
name: VQASerTokenLayoutLMLoss
|
||||
num_classes: *num_classes
|
||||
|
||||
Optimizer:
|
||||
name: AdamW
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
lr:
|
||||
name: Linear
|
||||
learning_rate: 0.00005
|
||||
epochs: *epoch_num
|
||||
warmup_epoch: 2
|
||||
regularizer:
|
||||
name: L2
|
||||
factor: 0.00000
|
||||
|
||||
PostProcess:
|
||||
name: VQASerTokenLayoutLMPostProcess
|
||||
class_path: &class_path ./train_data/SROIE/class_list.txt
|
||||
|
||||
Metric:
|
||||
name: VQASerTokenMetric
|
||||
main_indicator: hmean
|
||||
|
||||
Train:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/SROIE/train
|
||||
label_file_list:
|
||||
- ./train_data/SROIE/train.txt
|
||||
ratio_list: [ 1.0 ]
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: &max_seq_len 512
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: True
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
||||
|
||||
Eval:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: train_data/SROIE/test
|
||||
label_file_list:
|
||||
- ./train_data/SROIE/test.txt
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: *max_seq_len
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
|
@ -0,0 +1,123 @@
|
|||
Global:
|
||||
use_gpu: True
|
||||
epoch_num: &epoch_num 100
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/ser_layoutxlm_wildreceipt
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 200 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 200 ]
|
||||
cal_metric_during_train: False
|
||||
save_inference_dir:
|
||||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: train_data//wildreceipt/image_files/Image_12/10/845be0dd6f5b04866a2042abd28d558032ef2576.jpeg
|
||||
save_res_path: ./output/ser_layoutxlm_wildreceipt/res
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
||||
algorithm: &algorithm "LayoutXLM"
|
||||
Transform:
|
||||
Backbone:
|
||||
name: LayoutXLMForSer
|
||||
pretrained: True
|
||||
checkpoints:
|
||||
num_classes: &num_classes 51
|
||||
|
||||
Loss:
|
||||
name: VQASerTokenLayoutLMLoss
|
||||
num_classes: *num_classes
|
||||
|
||||
Optimizer:
|
||||
name: AdamW
|
||||
beta1: 0.9
|
||||
beta2: 0.999
|
||||
lr:
|
||||
name: Linear
|
||||
learning_rate: 0.00005
|
||||
epochs: *epoch_num
|
||||
warmup_epoch: 2
|
||||
regularizer:
|
||||
name: L2
|
||||
factor: 0.00000
|
||||
|
||||
PostProcess:
|
||||
name: VQASerTokenLayoutLMPostProcess
|
||||
class_path: &class_path ./train_data/wildreceipt/class_list.txt
|
||||
|
||||
Metric:
|
||||
name: VQASerTokenMetric
|
||||
main_indicator: hmean
|
||||
|
||||
Train:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: ./train_data/wildreceipt/
|
||||
label_file_list:
|
||||
- ./train_data/wildreceipt/wildreceipt_train.txt
|
||||
ratio_list: [ 1.0 ]
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: &max_seq_len 512
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: True
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
||||
|
||||
Eval:
|
||||
dataset:
|
||||
name: SimpleDataSet
|
||||
data_dir: train_data/wildreceipt
|
||||
label_file_list:
|
||||
- ./train_data/wildreceipt/wildreceipt_test.txt
|
||||
transforms:
|
||||
- DecodeImage: # load image
|
||||
img_mode: RGB
|
||||
channel_first: False
|
||||
- VQATokenLabelEncode: # Class handling label
|
||||
contains_re: False
|
||||
algorithm: *algorithm
|
||||
class_path: *class_path
|
||||
- VQATokenPad:
|
||||
max_seq_len: *max_seq_len
|
||||
return_attention_mask: True
|
||||
- VQASerTokenChunk:
|
||||
max_seq_len: *max_seq_len
|
||||
- Resize:
|
||||
size: [224,224]
|
||||
- NormalizeImage:
|
||||
scale: 1
|
||||
mean: [ 123.675, 116.28, 103.53 ]
|
||||
std: [ 58.395, 57.12, 57.375 ]
|
||||
order: 'hwc'
|
||||
- ToCHWImage:
|
||||
- KeepKeys:
|
||||
# dataloader will return list in this order
|
||||
keep_keys: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
|
||||
loader:
|
||||
shuffle: False
|
||||
drop_last: False
|
||||
batch_size_per_card: 8
|
||||
num_workers: 4
|
|
@ -3,7 +3,7 @@ Global:
|
|||
epoch_num: &epoch_num 200
|
||||
log_smooth_window: 10
|
||||
print_batch_step: 10
|
||||
save_model_dir: ./output/ser_layoutxlm/
|
||||
save_model_dir: ./output/ser_layoutxlm_xfund_zh
|
||||
save_epoch_step: 2000
|
||||
# evaluation is run every 19 iterations after the 0th iteration
|
||||
eval_batch_step: [ 0, 19 ]
|
||||
|
@ -12,7 +12,7 @@ Global:
|
|||
use_visualdl: False
|
||||
seed: 2022
|
||||
infer_img: ppstructure/docs/vqa/input/zh_val_42.jpg
|
||||
save_res_path: ./output/ser
|
||||
save_res_path: ./output/ser_layoutxlm_xfund_zh/res
|
||||
|
||||
Architecture:
|
||||
model_type: vqa
|
|
@ -869,6 +869,7 @@ class VQATokenLabelEncode(object):
|
|||
contains_re=False,
|
||||
add_special_ids=False,
|
||||
algorithm='LayoutXLM',
|
||||
use_textline_bbox_info=True,
|
||||
infer_mode=False,
|
||||
ocr_engine=None,
|
||||
**kwargs):
|
||||
|
@ -897,11 +898,51 @@ class VQATokenLabelEncode(object):
|
|||
self.add_special_ids = add_special_ids
|
||||
self.infer_mode = infer_mode
|
||||
self.ocr_engine = ocr_engine
|
||||
self.use_textline_bbox_info = use_textline_bbox_info
|
||||
|
||||
def split_bbox(self, bbox, text, tokenizer):
    """Split a text-line box into one box per token.

    The line width is shared out evenly per character of ``text``. Each
    whitespace-separated word gets a box spanning its characters, and that
    box is emitted once for every token ``tokenizer.tokenize(word)``
    returns.

    Args:
        bbox: ``[x1, y1, x2, y2]`` of the whole text line.
        text: Transcription of the text line.
        tokenizer: Object exposing ``tokenize(word)`` that returns a sized
            sequence of tokens.

    Returns:
        A list with one ``[x1, y1, x2, y2]`` box per token; empty when
        ``text`` contains no words.
    """
    words = text.split()
    if not words:
        # Covers "" (which used to divide by zero) and whitespace-only text.
        return []
    x1, y1, x2, y2 = bbox
    unit_w = (x2 - x1) / len(text)
    token_bboxes = []
    for word in words:
        word_bbox = [x1, y1, x1 + len(word) * unit_w, y2]
        # Independent copies, so editing one token box never touches another.
        token_bboxes.extend(
            list(word_bbox) for _ in range(len(tokenizer.tokenize(word))))
        # Step over the word plus one separator character.
        x1 += (len(word) + 1) * unit_w
    return token_bboxes
|
||||
|
||||
def filter_empty_contents(self, ocr_info):
    """Drop entries with empty text and every link that points at them.

    Args:
        ocr_info: List of dicts, each with "transcription", "id" and
            "linking" (a list of two-element id pairs).

    Returns:
        Deep copies of the entries whose "transcription" is non-empty,
        with "linking" reduced to the pairs that do not mention the id of
        a removed entry. The input list is left untouched.
    """
    kept = [
        copy.deepcopy(entry) for entry in ocr_info
        if len(entry["transcription"]) > 0
    ]
    dropped_ids = [
        entry["id"] for entry in ocr_info
        if not len(entry["transcription"]) > 0
    ]

    for entry in kept:
        entry["linking"] = [
            pair for pair in entry["linking"]
            if pair[0] not in dropped_ids and pair[1] not in dropped_ids
        ]
    return kept
|
||||
|
||||
def __call__(self, data):
|
||||
# load bbox and label info
|
||||
ocr_info = self._load_ocr_info(data)
|
||||
|
||||
# for re
|
||||
train_re = self.contains_re and not self.infer_mode
|
||||
if train_re:
|
||||
ocr_info = self.filter_empty_contents(ocr_info)
|
||||
|
||||
height, width, _ = data['image'].shape
|
||||
|
||||
words_list = []
|
||||
|
@ -913,8 +954,6 @@ class VQATokenLabelEncode(object):
|
|||
|
||||
entities = []
|
||||
|
||||
# for re
|
||||
train_re = self.contains_re and not self.infer_mode
|
||||
if train_re:
|
||||
relations = []
|
||||
id2label = {}
|
||||
|
@ -924,18 +963,19 @@ class VQATokenLabelEncode(object):
|
|||
data['ocr_info'] = copy.deepcopy(ocr_info)
|
||||
|
||||
for info in ocr_info:
|
||||
text = info["transcription"]
|
||||
if len(text) <= 0:
|
||||
continue
|
||||
if train_re:
|
||||
# for re
|
||||
if len(info["transcription"]) == 0:
|
||||
if len(text) == 0:
|
||||
empty_entity.add(info["id"])
|
||||
continue
|
||||
id2label[info["id"]] = info["label"]
|
||||
relations.extend([tuple(sorted(l)) for l in info["linking"]])
|
||||
# smooth_box
|
||||
info["bbox"] = self.trans_poly_to_bbox(info["points"])
|
||||
bbox = self._smooth_box(info["bbox"], height, width)
|
||||
|
||||
text = info["transcription"]
|
||||
encode_res = self.tokenizer.encode(
|
||||
text, pad_to_max_seq_len=False, return_attention_mask=True)
|
||||
|
||||
|
@ -946,6 +986,19 @@ class VQATokenLabelEncode(object):
|
|||
-1]
|
||||
encode_res["attention_mask"] = encode_res["attention_mask"][1:
|
||||
-1]
|
||||
|
||||
if self.use_textline_bbox_info:
|
||||
bbox = [info["bbox"]] * len(encode_res["input_ids"])
|
||||
else:
|
||||
bbox = self.split_bbox(info["bbox"], info["transcription"],
|
||||
self.tokenizer)
|
||||
if len(bbox) <= 0:
|
||||
continue
|
||||
bbox = self._smooth_box(bbox, height, width)
|
||||
if self.add_special_ids:
|
||||
bbox.insert(0, [0, 0, 0, 0])
|
||||
bbox.append([0, 0, 0, 0])
|
||||
|
||||
# parse label
|
||||
if not self.infer_mode:
|
||||
label = info['label']
|
||||
|
@ -970,7 +1023,7 @@ class VQATokenLabelEncode(object):
|
|||
})
|
||||
input_ids_list.extend(encode_res["input_ids"])
|
||||
token_type_ids_list.extend(encode_res["token_type_ids"])
|
||||
bbox_list.extend([bbox] * len(encode_res["input_ids"]))
|
||||
bbox_list.extend(bbox)
|
||||
words_list.append(text)
|
||||
segment_offset_id.append(len(input_ids_list))
|
||||
if not self.infer_mode:
|
||||
|
@ -1019,12 +1072,14 @@ class VQATokenLabelEncode(object):
|
|||
info_dict = json.loads(info)
|
||||
return info_dict
|
||||
|
||||
def _smooth_box(self, bbox, height, width):
|
||||
bbox[0] = int(bbox[0] * 1000.0 / width)
|
||||
bbox[2] = int(bbox[2] * 1000.0 / width)
|
||||
bbox[1] = int(bbox[1] * 1000.0 / height)
|
||||
bbox[3] = int(bbox[3] * 1000.0 / height)
|
||||
return bbox
|
||||
def _smooth_box(self, bboxes, height, width):
|
||||
bboxes = np.array(bboxes)
|
||||
bboxes[:, 0] = bboxes[:, 0] * 1000 / width
|
||||
bboxes[:, 2] = bboxes[:, 2] * 1000 / width
|
||||
bboxes[:, 1] = bboxes[:, 1] * 1000 / height
|
||||
bboxes[:, 3] = bboxes[:, 3] * 1000 / height
|
||||
bboxes = bboxes.astype("int64").tolist()
|
||||
return bboxes
|
||||
|
||||
def _parse_label(self, label, encode_res):
|
||||
gt_label = []
|
||||
|
|
|
@ -37,23 +37,26 @@ class VQAReTokenMetric(object):
|
|||
gt_relations = []
|
||||
for b in range(len(self.relations_list)):
|
||||
rel_sent = []
|
||||
for head, tail in zip(self.relations_list[b]["head"],
|
||||
self.relations_list[b]["tail"]):
|
||||
rel = {}
|
||||
rel["head_id"] = head
|
||||
rel["head"] = (self.entities_list[b]["start"][rel["head_id"]],
|
||||
self.entities_list[b]["end"][rel["head_id"]])
|
||||
rel["head_type"] = self.entities_list[b]["label"][rel[
|
||||
"head_id"]]
|
||||
if "head" in self.relations_list[b]:
|
||||
for head, tail in zip(self.relations_list[b]["head"],
|
||||
self.relations_list[b]["tail"]):
|
||||
rel = {}
|
||||
rel["head_id"] = head
|
||||
rel["head"] = (
|
||||
self.entities_list[b]["start"][rel["head_id"]],
|
||||
self.entities_list[b]["end"][rel["head_id"]])
|
||||
rel["head_type"] = self.entities_list[b]["label"][rel[
|
||||
"head_id"]]
|
||||
|
||||
rel["tail_id"] = tail
|
||||
rel["tail"] = (self.entities_list[b]["start"][rel["tail_id"]],
|
||||
self.entities_list[b]["end"][rel["tail_id"]])
|
||||
rel["tail_type"] = self.entities_list[b]["label"][rel[
|
||||
"tail_id"]]
|
||||
rel["tail_id"] = tail
|
||||
rel["tail"] = (
|
||||
self.entities_list[b]["start"][rel["tail_id"]],
|
||||
self.entities_list[b]["end"][rel["tail_id"]])
|
||||
rel["tail_type"] = self.entities_list[b]["label"][rel[
|
||||
"tail_id"]]
|
||||
|
||||
rel["type"] = 1
|
||||
rel_sent.append(rel)
|
||||
rel["type"] = 1
|
||||
rel_sent.append(rel)
|
||||
gt_relations.append(rel_sent)
|
||||
re_metrics = self.re_score(
|
||||
self.pred_relations_list, gt_relations, mode="boundaries")
|
||||
|
|
|
@ -43,9 +43,11 @@ class NLPBaseModel(nn.Layer):
|
|||
super(NLPBaseModel, self).__init__()
|
||||
if checkpoints is not None:
|
||||
self.model = model_class.from_pretrained(checkpoints)
|
||||
elif isinstance(pretrained, (str, )) and os.path.exists(pretrained):
|
||||
self.model = model_class.from_pretrained(pretrained)
|
||||
else:
|
||||
pretrained_model_name = pretrained_model_dict[base_model_class]
|
||||
if pretrained:
|
||||
if pretrained is True:
|
||||
base_model = base_model_class.from_pretrained(
|
||||
pretrained_model_name)
|
||||
else:
|
||||
|
|
|
@ -0,0 +1,151 @@
|
|||
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import cv2
|
||||
import numpy as np
|
||||
from copy import deepcopy
|
||||
|
||||
|
||||
def trans_poly_to_bbox(poly):
    """Return ``[x_min, y_min, x_max, y_max]`` for a polygon.

    ``poly`` is a sequence of points; index 0 of each point is taken as x
    and index 1 as y.
    """
    xs = [point[0] for point in poly]
    ys = [point[1] for point in poly]
    return [np.min(xs), np.min(ys), np.max(xs), np.max(ys)]
|
||||
|
||||
|
||||
def get_outer_poly(bbox_list):
    """Return the four corners of the rectangle enclosing all boxes.

    Each box is ``[x1, y1, x2, y2]``. The rectangle uses the minimum of
    the first two coordinates and the maximum of the last two, and its
    corners are listed as ``[x1, y1], [x2, y1], [x2, y2], [x1, y2]``.
    """
    left = min(box[0] for box in bbox_list)
    top = min(box[1] for box in bbox_list)
    right = max(box[2] for box in bbox_list)
    bottom = max(box[3] for box in bbox_list)
    return [[left, top], [right, top], [right, bottom], [left, bottom]]
|
||||
|
||||
|
||||
def load_funsd_label(image_dir, anno_dir):
    """Convert FUNSD annotation files into per-image lists of text segments.

    Every ``*.json`` file in ``anno_dir`` is read and the entities under
    its ``"form"`` key are re-cut into segments. Consecutive words of an
    entity stay in one segment until a word's left x plus 10 is not beyond
    the previous word's right x; that word then starts a new segment.
    Segments receive fresh ids in creation order, are ordered
    top-to-bottom / left-to-right, and their links are rewritten from the
    original entity ids to the new segment ids.

    Args:
        image_dir: Not used by the conversion; accepted so existing call
            sites keep working.
        anno_dir: Directory holding the ``*.json`` annotation files.

    Returns:
        dict mapping each annotation file name (without ``.json``) to its
        list of segment dicts with keys "transcription", "label",
        "points", "linking" and "id".
    """
    # Only real annotation files are converted; sorting keeps the output
    # order independent of whatever order os.listdir returns.
    anno_names = sorted(
        os.path.splitext(fn)[0] for fn in os.listdir(anno_dir)
        if fn.endswith(".json"))

    fn_info_map = dict()
    for anno_fn in anno_names:
        anno_path = os.path.join(anno_dir, anno_fn + ".json")
        with open(anno_path, "r", encoding="utf-8") as fin:
            infos = json.load(fin)["form"]

        res = []
        old_id2new_id_map = dict()
        for info in infos:
            if info["text"] is None:
                continue
            words = info["words"]
            if len(words) <= 0:
                continue
            curr_bboxes = [words[0]["box"]]
            curr_texts = [words[0]["text"]]
            for prev_word, word in zip(words, words[1:]):
                if word["box"][0] + 10 <= prev_word["box"][2]:
                    # The word jumped back to the left: close the running
                    # segment and start a new one with this word.
                    _append_funsd_segment(res, old_id2new_id_map, info,
                                          curr_texts, curr_bboxes)
                    curr_bboxes = [word["box"]]
                    curr_texts = [word["text"]]
                else:
                    curr_bboxes.append(word["box"])
                    curr_texts.append(word["text"])
            _append_funsd_segment(res, old_id2new_id_map, info, curr_texts,
                                  curr_bboxes)

        res = sorted(
            res, key=lambda r: (r["points"][0][1], r["points"][0][0]))
        _reorder_funsd_rows(res)

        # Rewrite links from original entity ids to the new segment ids.
        for segment in res:
            new_links = []
            for link in segment["linking"]:
                # Links that mention an entity without any kept segment
                # are removed.
                if link[0] not in old_id2new_id_map or link[
                        1] not in old_id2new_id_map:
                    continue
                for src in old_id2new_id_map[link[0]]:
                    for dst in old_id2new_id_map[link[1]]:
                        new_links.append([src, dst])
            segment["linking"] = new_links

        fn_info_map[anno_fn] = res

    return fn_info_map


def _append_funsd_segment(res, old_id2new_id_map, info, texts, bboxes):
    """Append one segment to ``res`` and record its id under ``info["id"]``."""
    # NOTE(review): only the first word's text is checked for emptiness,
    # as in the original code; confirm this is intended.
    if len("".join(texts[0])) <= 0:
        return
    # Ids are handed out in creation order, i.e. the current length of res.
    new_id = len(res)
    res.append({
        "transcription": " ".join(texts),
        "label": info["label"],
        "points": get_outer_poly(bboxes),
        "linking": info["linking"],
        "id": new_id,
    })
    old_id2new_id_map.setdefault(info["id"], []).append(new_id)


def _reorder_funsd_rows(res):
    """Move segments left while they sit on the same row as their neighbour."""
    for i in range(len(res) - 1):
        # j has to reach 0; stopping at 1 never compares the first two
        # segments.
        for j in range(i, -1, -1):
            right_pt = res[j + 1]["points"][0]
            left_pt = res[j]["points"][0]
            if abs(right_pt[1] - left_pt[1]) < 20 and \
                    right_pt[0] < left_pt[0]:
                res[j], res[j + 1] = res[j + 1], res[j]
            else:
                break
|
||||
|
||||
|
||||
def main():
    """Convert the FUNSD test and training annotations to label files.

    For each split, ``load_funsd_label`` is called and one line per image
    is written: ``<name>.png``, a tab, then the JSON-encoded segment list.
    """
    splits = (
        ("train_data/FUNSD/testing_data/images/",
         "train_data/FUNSD/testing_data/annotations/",
         "train_data/FUNSD/test.json"),
        ("train_data/FUNSD/training_data/images/",
         "train_data/FUNSD/training_data/annotations/",
         "train_data/FUNSD/train.json"),
    )

    for image_dir, anno_dir, output_path in splits:
        fn_info_map = load_funsd_label(image_dir, anno_dir)
        # ensure_ascii=False emits raw non-ASCII text, so the encoding is
        # pinned instead of relying on the platform default.
        with open(output_path, "w", encoding="utf-8") as fout:
            for fn, segments in fn_info_map.items():
                fout.write(fn + ".png" + "\t" + json.dumps(
                    segments, ensure_ascii=False) + "\n")

    print("====ok====")
    return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Reference in New Issue