parent
8cd341095e
commit
5c6f90d6f7
|
@ -98,7 +98,7 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR
|
|||
# 需要使用PaddleNLP最新的代码版本进行安装
|
||||
git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
|
||||
cd PaddleNLP
|
||||
pip install -e .
|
||||
pip3 install -e .
|
||||
```
|
||||
|
||||
|
||||
|
@ -141,7 +141,6 @@ python3.7 train_ser.py \
|
|||
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
||||
--num_train_epochs 200 \
|
||||
--eval_steps 10 \
|
||||
--save_steps 500 \
|
||||
--output_dir "./output/ser/" \
|
||||
--learning_rate 5e-5 \
|
||||
--warmup_steps 50 \
|
||||
|
@ -151,6 +150,48 @@ python3.7 train_ser.py \
|
|||
|
||||
最终会打印出`precision`, `recall`, `f1`等指标,模型和训练日志会保存在`./output/ser/`文件夹中。
|
||||
|
||||
* 恢复训练
|
||||
|
||||
```shell
|
||||
python3.7 train_ser.py \
|
||||
--model_name_or_path "model_path" \
|
||||
--train_data_dir "XFUND/zh_train/image" \
|
||||
--train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
|
||||
--eval_data_dir "XFUND/zh_val/image" \
|
||||
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
||||
--num_train_epochs 200 \
|
||||
--eval_steps 10 \
|
||||
--output_dir "./output/ser/" \
|
||||
--learning_rate 5e-5 \
|
||||
--warmup_steps 50 \
|
||||
--evaluate_during_training \
|
||||
--seed 2048 \
|
||||
--resume
|
||||
```
|
||||
|
||||
* 评估
|
||||
```shell
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
python3 eval_ser.py \
|
||||
--model_name_or_path "PP-Layout_v1.0_ser_pretrained/" \
|
||||
--eval_data_dir "XFUND/zh_val/image" \
|
||||
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
||||
--per_gpu_eval_batch_size 8 \
|
||||
--output_dir "output/ser/" \
|
||||
--seed 2048
|
||||
```
|
||||
最终会打印出`precision`, `recall`, `f1`等指标
|
||||
|
||||
|
||||
```shell
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
python3.7 infer_ser.py \
|
||||
--model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
|
||||
--output_dir "output_res/" \
|
||||
--infer_imgs "XFUND/zh_val/image/" \
|
||||
--ocr_json_path "XFUND/zh_val/xfun_normalize_val.json"
|
||||
```
|
||||
|
||||
* 使用评估集合中提供的OCR识别结果进行预测
|
||||
|
||||
```shell
|
||||
|
@ -188,6 +229,7 @@ python3.7 helper/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_nor
|
|||
* 启动训练
|
||||
|
||||
```shell
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
python3 train_re.py \
|
||||
--model_name_or_path "layoutxlm-base-uncased" \
|
||||
--train_data_dir "XFUND/zh_train/image" \
|
||||
|
@ -197,7 +239,6 @@ python3 train_re.py \
|
|||
--label_map_path 'labels/labels_ser.txt' \
|
||||
--num_train_epochs 2 \
|
||||
--eval_steps 10 \
|
||||
--save_steps 500 \
|
||||
--output_dir "output/re/" \
|
||||
--learning_rate 5e-5 \
|
||||
--warmup_steps 50 \
|
||||
|
@ -208,8 +249,48 @@ python3 train_re.py \
|
|||
|
||||
```
|
||||
|
||||
* 恢复训练
|
||||
|
||||
```shell
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
python3 train_re.py \
|
||||
--model_name_or_path "model_path" \
|
||||
--train_data_dir "XFUND/zh_train/image" \
|
||||
--train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
|
||||
--eval_data_dir "XFUND/zh_val/image" \
|
||||
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
||||
--label_map_path 'labels/labels_ser.txt' \
|
||||
--num_train_epochs 2 \
|
||||
--eval_steps 10 \
|
||||
--output_dir "output/re/" \
|
||||
--learning_rate 5e-5 \
|
||||
--warmup_steps 50 \
|
||||
--per_gpu_train_batch_size 8 \
|
||||
--per_gpu_eval_batch_size 8 \
|
||||
--evaluate_during_training \
|
||||
--seed 2048 \
|
||||
--resume
|
||||
|
||||
```
|
||||
|
||||
最终会打印出`precision`, `recall`, `f1`等指标,模型和训练日志会保存在`./output/re/`文件夹中。
|
||||
|
||||
* 评估
|
||||
```shell
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
python3 eval_re.py \
|
||||
--model_name_or_path "output/re/best_model" \
|
||||
--max_seq_length 512 \
|
||||
--eval_data_dir "XFUND/zh_val/image" \
|
||||
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
||||
--label_map_path 'labels/labels_ser.txt' \
|
||||
--output_dir "output/re_test/" \
|
||||
--per_gpu_eval_batch_size 8 \
|
||||
--seed 2048
|
||||
```
|
||||
最终会打印出`precision`, `recall`, `f1`等指标
|
||||
|
||||
|
||||
* 使用评估集合中提供的OCR识别结果进行预测
|
||||
|
||||
```shell
|
||||
|
@ -231,7 +312,7 @@ python3 infer_re.py \
|
|||
|
||||
```shell
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
# python3.7 infer_ser_re_e2e.py \
|
||||
python3.7 infer_ser_re_e2e.py \
|
||||
--model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
|
||||
--re_model_name_or_path "./PP-Layout_v1.0_re_pretrained/" \
|
||||
--max_seq_length 512 \
|
||||
|
|
|
@ -0,0 +1,125 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||
sys.path.append(__dir__)
|
||||
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||
|
||||
import paddle
|
||||
|
||||
from paddlenlp.transformers import LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForRelationExtraction
|
||||
|
||||
from xfun import XFUNDataset
|
||||
from utils import parse_args, get_bio_label_maps, print_arguments
|
||||
from data_collator import DataCollator
|
||||
from metric import re_score
|
||||
|
||||
from ppocr.utils.logging import get_logger
|
||||
|
||||
|
||||
def cal_metric(re_preds, re_labels, entities):
    """Score predicted relations against the ground-truth relations.

    Every (head, tail) index pair found in ``re_labels[b]`` is expanded into a
    relation dict using the ``"start"``, ``"end"`` and ``"label"`` entries of
    ``entities[b]``. The per-sample relation lists are then handed, together
    with ``re_preds``, to ``re_score`` in ``"boundaries"`` mode.

    Args:
        re_preds: predicted relations, forwarded unchanged to ``re_score``.
        re_labels: per-sample mappings with parallel ``"head"`` / ``"tail"``
            sequences of entity indices.
        entities: per-sample mappings with ``"start"``, ``"end"`` and
            ``"label"`` sequences indexed by entity id.

    Returns:
        Whatever ``re_score`` returns for the given predictions.
    """

    def _as_relation(sample_idx, head_id, tail_id):
        # Entities are looked up only when a relation actually exists.
        sample_entities = entities[sample_idx]
        return {
            "head_id": head_id,
            "head": (sample_entities["start"][head_id],
                     sample_entities["end"][head_id]),
            "head_type": sample_entities["label"][head_id],
            "tail_id": tail_id,
            "tail": (sample_entities["start"][tail_id],
                     sample_entities["end"][tail_id]),
            "tail_type": sample_entities["label"][tail_id],
            "type": 1,
        }

    gt_relations = [[
        _as_relation(sample_idx, head_id, tail_id)
        for head_id, tail_id in zip(sample_labels["head"],
                                    sample_labels["tail"])
    ] for sample_idx, sample_labels in enumerate(re_labels)]

    return re_score(re_preds, gt_relations, mode="boundaries")
|
||||
|
||||
|
||||
def evaluate(model, eval_dataloader, logger, prefix=""):
    """Run relation-extraction evaluation over ``eval_dataloader``.

    The model is switched to eval mode, every batch is run under
    ``paddle.no_grad()``, and the predicted relations are collected together
    with each batch's ``"relations"`` and ``"entities"``. The collected data
    is scored by ``cal_metric`` and the model is put back into train mode
    before returning.

    Args:
        model: called as ``model(**batch)``; the output must provide
            ``"loss"`` and ``"pred_relations"``.
        eval_dataloader: iterable of batches exposing ``.dataset``.
        logger: logger used for progress messages.
        prefix: text inserted into the "Running evaluation" banner.

    Returns:
        Dict with ``"precision"``, ``"recall"`` and ``"f1"`` taken from the
        ``"ALL"`` entry of the ``cal_metric`` result.
    """
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = {}".format(len(eval_dataloader.dataset)))

    pred_relations = []
    gold_relations = []
    gold_entities = []
    # Running total of the batch losses; not part of the returned metrics.
    total_loss = 0.0

    model.eval()
    for batch_idx, batch in enumerate(eval_dataloader):
        with paddle.no_grad():
            outputs = model(**batch)
            batch_loss = outputs['loss'].mean().item()
            # Progress is logged by the rank-0 process only.
            if paddle.distributed.get_rank() == 0:
                message = "[Eval] process: {}/{}, loss: {:.5f}".format(
                    batch_idx, len(eval_dataloader), batch_loss)
                logger.info(message)

            total_loss += batch_loss
        pred_relations.extend(outputs['pred_relations'])
        gold_relations.extend(batch['relations'])
        gold_entities.extend(batch['entities'])

    overall = cal_metric(pred_relations, gold_relations, gold_entities)["ALL"]
    summary = {
        "precision": overall["p"],
        "recall": overall["r"],
        "f1": overall["f1"],
    }
    model.train()
    return summary
|
||||
|
||||
|
||||
def eval(args):
    """Evaluate a saved relation-extraction model on the evaluation set.

    Loads the tokenizer and ``LayoutXLMForRelationExtraction`` weights from
    ``args.model_name_or_path``, builds the XFUN evaluation dataset and
    dataloader from the ``eval_*`` arguments, runs ``evaluate`` and logs the
    resulting metrics.

    Args:
        args: parsed command-line arguments (see ``parse_args``).
    """
    logger = get_logger()

    # Only the label -> id direction is needed to build the dataset.
    label2id_map, _ = get_bio_label_maps(args.label_map_path)
    ignore_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    checkpoint = args.model_name_or_path
    tokenizer = LayoutXLMTokenizer.from_pretrained(checkpoint)
    model = LayoutXLMForRelationExtraction.from_pretrained(checkpoint)

    dataset_options = dict(
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        max_seq_len=args.max_seq_length,
        pad_token_label_id=ignore_label_id,
        contains_re=True,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    eval_dataset = XFUNDataset(tokenizer, **dataset_options)

    loader_options = dict(
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=8,
        shuffle=False,
        collate_fn=DataCollator())
    eval_dataloader = paddle.io.DataLoader(eval_dataset, **loader_options)

    metrics = evaluate(model, eval_dataloader, logger)
    logger.info("eval results: {}".format(metrics))
|
||||
|
||||
|
||||
# Script entry point: parse the command line and run the RE evaluation.
if __name__ == "__main__":
    eval(parse_args())
|
|
@ -0,0 +1,154 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||
sys.path.append(__dir__)
|
||||
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||
|
||||
import random
|
||||
import time
|
||||
import copy
|
||||
import logging
|
||||
|
||||
import argparse
|
||||
import paddle
|
||||
import numpy as np
|
||||
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
|
||||
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
|
||||
from xfun import XFUNDataset
|
||||
from utils import parse_args, get_bio_label_maps, print_arguments
|
||||
|
||||
from ppocr.utils.logging import get_logger
|
||||
|
||||
|
||||
def eval(args):
    """Evaluate a saved SER (token-classification) model on the eval set.

    Logs the arguments, loads the tokenizer and
    ``LayoutXLMForTokenClassification`` weights from
    ``args.model_name_or_path``, builds the XFUN evaluation dataset and
    dataloader, runs ``evaluate`` and logs the metrics it returns.

    Args:
        args: parsed command-line arguments (see ``parse_args``).
    """
    logger = get_logger()
    print_arguments(args, logger)

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    ignore_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    checkpoint = args.model_name_or_path
    tokenizer = LayoutXLMTokenizer.from_pretrained(checkpoint)
    model = LayoutXLMForTokenClassification.from_pretrained(checkpoint)

    dataset_options = dict(
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=ignore_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    eval_dataset = XFUNDataset(tokenizer, **dataset_options)

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=0,
        use_shared_memory=True,
        collate_fn=None)

    # The second return value (predicted label sequences) is not needed here.
    metrics, _ = evaluate(args, model, tokenizer, eval_dataloader,
                          label2id_map, id2label_map, ignore_label_id, logger)

    logger.info(metrics)
|
||||
|
||||
|
||||
def _write_label_rows(path, rows):
    """Write one line per label sequence to ``path``, each label tab-terminated."""
    with open(path, "w") as fout:
        for row in rows:
            fout.write("".join(label + "\t" for label in row) + "\n")


def evaluate(args,
             model,
             tokenizer,
             eval_dataloader,
             label2id_map,
             id2label_map,
             pad_token_label_id,
             logger,
             prefix=""):
    """Evaluate a token-classification model on ``eval_dataloader``.

    Runs every batch without gradients, takes the argmax of the logits over
    the last axis, and maps gold and predicted ids to label strings through
    ``id2label_map``. Positions whose gold id equals ``pad_token_label_id``
    are skipped. seqeval precision / recall / F1 are computed on the
    resulting sequences, which are also dumped to ``test_gt.txt`` and
    ``test_pred.txt`` inside ``args.output_dir``. The model is switched back
    to train mode before returning.

    Args:
        args: argument namespace; only ``args.output_dir`` is read here.
        model: called as ``model(**batch)``; its first two outputs are used
            as ``(loss, logits)``.
        tokenizer: not used by this function; kept for call compatibility.
        eval_dataloader: iterable of batches containing a ``"labels"`` tensor.
        label2id_map: not used by this function; kept for call compatibility.
        id2label_map: lookup from label id to label string.
        pad_token_label_id: gold label id marking positions to ignore.
        logger: logger used for progress and result messages.
        prefix: text appended to the "Eval results" banner.

    Returns:
        ``(results, preds_list)`` where ``results`` holds ``"loss"``,
        ``"precision"``, ``"recall"`` and ``"f1"`` and ``preds_list`` is the
        list of predicted label sequences.
    """
    eval_loss = 0.0
    nb_eval_steps = 0
    # Per-batch arrays are collected and concatenated once at the end, instead
    # of re-copying the growing arrays with np.append on every batch.
    logits_chunks = []
    label_chunks = []
    num_batches = len(eval_dataloader)

    model.eval()
    for idx, batch in enumerate(eval_dataloader):
        with paddle.no_grad():
            outputs = model(**batch)
            tmp_eval_loss, logits = outputs[:2]

            # .item() works for both 0-D and 1-element loss tensors, whereas
            # .numpy()[0] fails when the mean is returned as a 0-D tensor.
            batch_loss = tmp_eval_loss.mean().item()

            if paddle.distributed.get_rank() == 0:
                logger.info("[Eval]process: {}/{}, loss: {:.5f}".format(
                    idx, num_batches, batch_loss))

            eval_loss += batch_loss
        nb_eval_steps += 1
        logits_chunks.append(logits.numpy())
        label_chunks.append(batch["labels"].numpy())

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(np.concatenate(logits_chunks, axis=0), axis=2)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    out_label_list = []
    preds_list = []
    for gold_row, pred_row in zip(out_label_ids, preds):
        # Keep only positions that carry a real (non-padding) gold label.
        keep = gold_row != pad_token_label_id
        out_label_list.append([id2label_map[i] for i in gold_row[keep]])
        preds_list.append([id2label_map[i] for i in pred_row[keep]])

    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

    # The standalone eval entry point does not create the output directory,
    # so make sure it exists before dumping the label files.
    os.makedirs(args.output_dir, exist_ok=True)
    _write_label_rows(
        os.path.join(args.output_dir, "test_gt.txt"), out_label_list)
    _write_label_rows(
        os.path.join(args.output_dir, "test_pred.txt"), preds_list)

    report = classification_report(out_label_list, preds_list)
    logger.info("\n" + report)

    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info(" %s = %s", key, str(results[key]))
    model.train()
    return results, preds_list
|
||||
|
||||
|
||||
# Script entry point: parse the command line and run the SER evaluation.
if __name__ == "__main__":
    eval(parse_args())
|
|
@ -24,9 +24,9 @@ import paddle
|
|||
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
|
||||
|
||||
# relative reference
|
||||
from .utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps
|
||||
from utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps
|
||||
|
||||
from .utils import pad_sentences, split_page, preprocess, postprocess, merge_preds_list_with_ocr_info
|
||||
from utils import pad_sentences, split_page, preprocess, postprocess, merge_preds_list_with_ocr_info
|
||||
|
||||
|
||||
def trans_poly_to_bbox(poly):
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
sentencepiece
|
||||
yacs
|
||||
seqeval
|
|
@ -20,80 +20,20 @@ sys.path.append(__dir__)
|
|||
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||
|
||||
import random
|
||||
import time
|
||||
import numpy as np
|
||||
import paddle
|
||||
|
||||
from paddlenlp.transformers import LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForRelationExtraction
|
||||
|
||||
from xfun import XFUNDataset
|
||||
from utils import parse_args, get_bio_label_maps, print_arguments
|
||||
from utils import parse_args, get_bio_label_maps, print_arguments, set_seed
|
||||
from data_collator import DataCollator
|
||||
from metric import re_score
|
||||
from eval_re import evaluate
|
||||
|
||||
from ppocr.utils.logging import get_logger
|
||||
|
||||
|
||||
def set_seed(seed):
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
paddle.seed(seed)
|
||||
|
||||
|
||||
def cal_metric(re_preds, re_labels, entities):
|
||||
gt_relations = []
|
||||
for b in range(len(re_labels)):
|
||||
rel_sent = []
|
||||
for head, tail in zip(re_labels[b]["head"], re_labels[b]["tail"]):
|
||||
rel = {}
|
||||
rel["head_id"] = head
|
||||
rel["head"] = (entities[b]["start"][rel["head_id"]],
|
||||
entities[b]["end"][rel["head_id"]])
|
||||
rel["head_type"] = entities[b]["label"][rel["head_id"]]
|
||||
|
||||
rel["tail_id"] = tail
|
||||
rel["tail"] = (entities[b]["start"][rel["tail_id"]],
|
||||
entities[b]["end"][rel["tail_id"]])
|
||||
rel["tail_type"] = entities[b]["label"][rel["tail_id"]]
|
||||
|
||||
rel["type"] = 1
|
||||
rel_sent.append(rel)
|
||||
gt_relations.append(rel_sent)
|
||||
re_metrics = re_score(re_preds, gt_relations, mode="boundaries")
|
||||
return re_metrics
|
||||
|
||||
|
||||
def evaluate(model, eval_dataloader, logger, prefix=""):
|
||||
# Eval!
|
||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||
logger.info(" Num examples = {}".format(len(eval_dataloader.dataset)))
|
||||
|
||||
re_preds = []
|
||||
re_labels = []
|
||||
entities = []
|
||||
eval_loss = 0.0
|
||||
model.eval()
|
||||
for idx, batch in enumerate(eval_dataloader):
|
||||
with paddle.no_grad():
|
||||
outputs = model(**batch)
|
||||
loss = outputs['loss'].mean().item()
|
||||
if paddle.distributed.get_rank() == 0:
|
||||
logger.info("[Eval] process: {}/{}, loss: {:.5f}".format(
|
||||
idx, len(eval_dataloader), loss))
|
||||
|
||||
eval_loss += loss
|
||||
re_preds.extend(outputs['pred_relations'])
|
||||
re_labels.extend(batch['relations'])
|
||||
entities.extend(batch['entities'])
|
||||
re_metrics = cal_metric(re_preds, re_labels, entities)
|
||||
re_metrics = {
|
||||
"precision": re_metrics["ALL"]["p"],
|
||||
"recall": re_metrics["ALL"]["r"],
|
||||
"f1": re_metrics["ALL"]["f1"],
|
||||
}
|
||||
model.train()
|
||||
return re_metrics
|
||||
|
||||
|
||||
def train(args):
|
||||
logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
|
||||
print_arguments(args, logger)
|
||||
|
@ -109,9 +49,14 @@ def train(args):
|
|||
paddle.distributed.init_parallel_env()
|
||||
|
||||
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
|
||||
|
||||
model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
|
||||
model = LayoutXLMForRelationExtraction(model, dropout=None)
|
||||
if not args.resume:
|
||||
model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
|
||||
model = LayoutXLMForRelationExtraction(model, dropout=None)
|
||||
logger.info('train from scratch')
|
||||
else:
|
||||
logger.info('resume from {}'.format(args.model_name_or_path))
|
||||
model = LayoutXLMForRelationExtraction.from_pretrained(
|
||||
args.model_name_or_path)
|
||||
|
||||
# dist mode
|
||||
if paddle.distributed.get_world_size() > 1:
|
||||
|
@ -200,24 +145,45 @@ def train(args):
|
|||
best_metirc = {'f1': 0}
|
||||
model.train()
|
||||
|
||||
train_reader_cost = 0.0
|
||||
train_run_cost = 0.0
|
||||
total_samples = 0
|
||||
reader_start = time.time()
|
||||
|
||||
print_step = 1
|
||||
|
||||
for epoch in range(int(args.num_train_epochs)):
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
train_reader_cost += time.time() - reader_start
|
||||
train_start = time.time()
|
||||
outputs = model(**batch)
|
||||
train_run_cost += time.time() - train_start
|
||||
# model outputs are always tuple in ppnlp (see doc)
|
||||
loss = outputs['loss']
|
||||
loss = loss.mean()
|
||||
|
||||
logger.info(
|
||||
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {}, lr: {}".
|
||||
format(epoch, args.num_train_epochs, step, train_dataloader_len,
|
||||
global_step, np.mean(loss.numpy()), optimizer.get_lr()))
|
||||
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
optimizer.clear_grad()
|
||||
# lr_scheduler.step() # Update learning rate schedule
|
||||
|
||||
global_step += 1
|
||||
total_samples += batch['image'].shape[0]
|
||||
|
||||
if step % print_step == 0:
|
||||
logger.info(
|
||||
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
|
||||
format(epoch, args.num_train_epochs, step,
|
||||
train_dataloader_len, global_step,
|
||||
np.mean(loss.numpy()),
|
||||
optimizer.get_lr(), train_reader_cost / print_step, (
|
||||
train_reader_cost + train_run_cost) / print_step,
|
||||
total_samples / print_step, total_samples / (
|
||||
train_reader_cost + train_run_cost)))
|
||||
|
||||
train_reader_cost = 0.0
|
||||
train_run_cost = 0.0
|
||||
total_samples = 0
|
||||
|
||||
if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
|
||||
global_step % args.eval_steps == 0):
|
||||
|
@ -225,10 +191,9 @@ def train(args):
|
|||
if (paddle.distributed.get_rank() == 0 and args.
|
||||
evaluate_during_training): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results = evaluate(model, eval_dataloader, logger)
|
||||
if results['f1'] > best_metirc['f1']:
|
||||
if results['f1'] >= best_metirc['f1']:
|
||||
best_metirc = results
|
||||
output_dir = os.path.join(args.output_dir,
|
||||
"checkpoint-best")
|
||||
output_dir = os.path.join(args.output_dir, "best_model")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
model.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
@ -240,10 +205,9 @@ def train(args):
|
|||
logger.info("eval results: {}".format(results))
|
||||
logger.info("best_metirc: {}".format(best_metirc))
|
||||
|
||||
if (paddle.distributed.get_rank() == 0 and args.save_steps > 0 and
|
||||
global_step % args.save_steps == 0):
|
||||
if paddle.distributed.get_rank() == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-latest")
|
||||
output_dir = os.path.join(args.output_dir, "latest_model")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
if paddle.distributed.get_rank() == 0:
|
||||
model.save_pretrained(output_dir)
|
||||
|
@ -252,6 +216,7 @@ def train(args):
|
|||
os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to {}".format(
|
||||
output_dir))
|
||||
reader_start = time.time()
|
||||
logger.info("best_metirc: {}".format(best_metirc))
|
||||
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ sys.path.append(__dir__)
|
|||
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||
|
||||
import random
|
||||
import time
|
||||
import copy
|
||||
import logging
|
||||
|
||||
|
@ -29,19 +30,11 @@ import numpy as np
|
|||
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
|
||||
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
|
||||
from xfun import XFUNDataset
|
||||
from utils import parse_args
|
||||
from utils import get_bio_label_maps
|
||||
from utils import print_arguments
|
||||
|
||||
from utils import parse_args, get_bio_label_maps, print_arguments, set_seed
|
||||
from eval_ser import evaluate
|
||||
from ppocr.utils.logging import get_logger
|
||||
|
||||
|
||||
def set_seed(args):
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
paddle.seed(args.seed)
|
||||
|
||||
|
||||
def train(args):
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
|
||||
|
@ -55,9 +48,15 @@ def train(args):
|
|||
paddle.distributed.init_parallel_env()
|
||||
|
||||
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
|
||||
base_model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
|
||||
model = LayoutXLMForTokenClassification(
|
||||
base_model, num_classes=len(label2id_map), dropout=None)
|
||||
if not args.resume:
|
||||
model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
|
||||
model = LayoutXLMForTokenClassification(
|
||||
model, num_classes=len(label2id_map), dropout=None)
|
||||
logger.info('train from scratch')
|
||||
else:
|
||||
logger.info('resume from {}'.format(args.model_name_or_path))
|
||||
model = LayoutXLMForTokenClassification.from_pretrained(
|
||||
args.model_name_or_path)
|
||||
|
||||
# dist mode
|
||||
if paddle.distributed.get_world_size() > 1:
|
||||
|
@ -74,6 +73,17 @@ def train(args):
|
|||
add_special_ids=False,
|
||||
return_attention_mask=True,
|
||||
load_mode='all')
|
||||
eval_dataset = XFUNDataset(
|
||||
tokenizer,
|
||||
data_dir=args.eval_data_dir,
|
||||
label_path=args.eval_label_path,
|
||||
label2id_map=label2id_map,
|
||||
img_size=(224, 224),
|
||||
pad_token_label_id=pad_token_label_id,
|
||||
contains_re=False,
|
||||
add_special_ids=False,
|
||||
return_attention_mask=True,
|
||||
load_mode='all')
|
||||
|
||||
train_sampler = paddle.io.DistributedBatchSampler(
|
||||
train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
|
||||
|
@ -88,6 +98,13 @@ def train(args):
|
|||
use_shared_memory=True,
|
||||
collate_fn=None, )
|
||||
|
||||
eval_dataloader = paddle.io.DataLoader(
|
||||
eval_dataset,
|
||||
batch_size=args.per_gpu_eval_batch_size,
|
||||
num_workers=0,
|
||||
use_shared_memory=True,
|
||||
collate_fn=None, )
|
||||
|
||||
t_total = len(train_dataloader) * args.num_train_epochs
|
||||
|
||||
# build linear decay with warmup lr sch
|
||||
|
@ -122,28 +139,49 @@ def train(args):
|
|||
|
||||
global_step = 0
|
||||
tr_loss = 0.0
|
||||
set_seed(args)
|
||||
set_seed(args.seed)
|
||||
best_metrics = None
|
||||
|
||||
train_reader_cost = 0.0
|
||||
train_run_cost = 0.0
|
||||
total_samples = 0
|
||||
reader_start = time.time()
|
||||
|
||||
print_step = 1
|
||||
model.train()
|
||||
for epoch_id in range(args.num_train_epochs):
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
model.train()
|
||||
train_reader_cost += time.time() - reader_start
|
||||
|
||||
train_start = time.time()
|
||||
outputs = model(**batch)
|
||||
train_run_cost += time.time() - train_start
|
||||
|
||||
# model outputs are always tuple in ppnlp (see doc)
|
||||
loss = outputs[0]
|
||||
loss = loss.mean()
|
||||
logger.info(
|
||||
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {}, lr: {}".
|
||||
format(epoch_id, args.num_train_epochs, step,
|
||||
len(train_dataloader), global_step,
|
||||
loss.numpy()[0], lr_scheduler.get_lr()))
|
||||
|
||||
loss.backward()
|
||||
tr_loss += loss.item()
|
||||
optimizer.step()
|
||||
lr_scheduler.step() # Update learning rate schedule
|
||||
optimizer.clear_grad()
|
||||
global_step += 1
|
||||
total_samples += batch['image'].shape[0]
|
||||
|
||||
if step % print_step == 0:
|
||||
logger.info(
|
||||
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
|
||||
format(epoch_id, args.num_train_epochs, step,
|
||||
len(train_dataloader), global_step,
|
||||
loss.numpy()[0],
|
||||
lr_scheduler.get_lr(), train_reader_cost /
|
||||
print_step, (train_reader_cost + train_run_cost) /
|
||||
print_step, total_samples / print_step, total_samples
|
||||
/ (train_reader_cost + train_run_cost)))
|
||||
|
||||
train_reader_cost = 0.0
|
||||
train_run_cost = 0.0
|
||||
total_samples = 0
|
||||
|
||||
if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
|
||||
global_step % args.eval_steps == 0):
|
||||
|
@ -151,9 +189,9 @@ def train(args):
|
|||
# Only evaluate when single GPU otherwise metrics may not average well
|
||||
if paddle.distributed.get_rank(
|
||||
) == 0 and args.evaluate_during_training:
|
||||
results, _ = evaluate(args, model, tokenizer, label2id_map,
|
||||
id2label_map, pad_token_label_id,
|
||||
logger)
|
||||
results, _ = evaluate(
|
||||
args, model, tokenizer, eval_dataloader, label2id_map,
|
||||
id2label_map, pad_token_label_id, logger)
|
||||
|
||||
if best_metrics is None or results["f1"] >= best_metrics[
|
||||
"f1"]:
|
||||
|
@ -175,11 +213,9 @@ def train(args):
|
|||
if best_metrics is not None:
|
||||
logger.info("best metrics: {}".format(best_metrics))
|
||||
|
||||
if paddle.distributed.get_rank(
|
||||
) == 0 and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
if paddle.distributed.get_rank() == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir,
|
||||
"checkpoint-{}".format(global_step))
|
||||
output_dir = os.path.join(args.output_dir, "latest_model")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
if paddle.distributed.get_rank() == 0:
|
||||
model.save_pretrained(output_dir)
|
||||
|
@ -187,112 +223,10 @@ def train(args):
|
|||
paddle.save(args,
|
||||
os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
reader_start = time.time()
|
||||
return global_step, tr_loss / global_step
|
||||
|
||||
|
||||
def evaluate(args,
|
||||
model,
|
||||
tokenizer,
|
||||
label2id_map,
|
||||
id2label_map,
|
||||
pad_token_label_id,
|
||||
logger,
|
||||
prefix=""):
|
||||
eval_dataset = XFUNDataset(
|
||||
tokenizer,
|
||||
data_dir=args.eval_data_dir,
|
||||
label_path=args.eval_label_path,
|
||||
label2id_map=label2id_map,
|
||||
img_size=(224, 224),
|
||||
pad_token_label_id=pad_token_label_id,
|
||||
contains_re=False,
|
||||
add_special_ids=False,
|
||||
return_attention_mask=True,
|
||||
load_mode='all')
|
||||
|
||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(
|
||||
1, paddle.distributed.get_world_size())
|
||||
|
||||
eval_dataloader = paddle.io.DataLoader(
|
||||
eval_dataset,
|
||||
batch_size=args.eval_batch_size,
|
||||
num_workers=0,
|
||||
use_shared_memory=True,
|
||||
collate_fn=None, )
|
||||
|
||||
# Eval!
|
||||
logger.info("***** Running evaluation %s *****", prefix)
|
||||
logger.info(" Num examples = %d", len(eval_dataset))
|
||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||
eval_loss = 0.0
|
||||
nb_eval_steps = 0
|
||||
preds = None
|
||||
out_label_ids = None
|
||||
model.eval()
|
||||
for idx, batch in enumerate(eval_dataloader):
|
||||
with paddle.no_grad():
|
||||
outputs = model(**batch)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
|
||||
tmp_eval_loss = tmp_eval_loss.mean()
|
||||
|
||||
if paddle.distributed.get_rank() == 0:
|
||||
logger.info("[Eval]process: {}/{}, loss: {:.5f}".format(
|
||||
idx, len(eval_dataloader), tmp_eval_loss.numpy()[0]))
|
||||
|
||||
eval_loss += tmp_eval_loss.item()
|
||||
nb_eval_steps += 1
|
||||
if preds is None:
|
||||
preds = logits.numpy()
|
||||
out_label_ids = batch["labels"].numpy()
|
||||
else:
|
||||
preds = np.append(preds, logits.numpy(), axis=0)
|
||||
out_label_ids = np.append(
|
||||
out_label_ids, batch["labels"].numpy(), axis=0)
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
preds = np.argmax(preds, axis=2)
|
||||
|
||||
# label_map = {i: label.upper() for i, label in enumerate(labels)}
|
||||
|
||||
out_label_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
preds_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
|
||||
for i in range(out_label_ids.shape[0]):
|
||||
for j in range(out_label_ids.shape[1]):
|
||||
if out_label_ids[i, j] != pad_token_label_id:
|
||||
out_label_list[i].append(id2label_map[out_label_ids[i][j]])
|
||||
preds_list[i].append(id2label_map[preds[i][j]])
|
||||
|
||||
results = {
|
||||
"loss": eval_loss,
|
||||
"precision": precision_score(out_label_list, preds_list),
|
||||
"recall": recall_score(out_label_list, preds_list),
|
||||
"f1": f1_score(out_label_list, preds_list),
|
||||
}
|
||||
|
||||
with open(os.path.join(args.output_dir, "test_gt.txt"), "w") as fout:
|
||||
for lbl in out_label_list:
|
||||
for l in lbl:
|
||||
fout.write(l + "\t")
|
||||
fout.write("\n")
|
||||
with open(os.path.join(args.output_dir, "test_pred.txt"), "w") as fout:
|
||||
for lbl in preds_list:
|
||||
for l in lbl:
|
||||
fout.write(l + "\t")
|
||||
fout.write("\n")
|
||||
|
||||
report = classification_report(out_label_list, preds_list)
|
||||
logger.info("\n" + report)
|
||||
|
||||
logger.info("***** Eval results %s *****", prefix)
|
||||
for key in sorted(results.keys()):
|
||||
logger.info(" %s = %s", key, str(results[key]))
|
||||
|
||||
return results, preds_list
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
train(args)
|
||||
|
|
|
@ -25,6 +25,12 @@ import paddle
|
|||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
|
||||
def set_seed(seed):
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
paddle.seed(seed)
|
||||
|
||||
|
||||
def get_bio_label_maps(label_map_path):
|
||||
with open(label_map_path, "r") as fin:
|
||||
lines = fin.readlines()
|
||||
|
@ -375,8 +381,6 @@ def parse_args():
|
|||
help="Linear warmup over warmup_steps.",)
|
||||
parser.add_argument("--eval_steps", type=int, default=10,
|
||||
help="eval every X updates steps.",)
|
||||
parser.add_argument("--save_steps", type=int, default=50,
|
||||
help="Save checkpoint every X updates steps.",)
|
||||
parser.add_argument("--seed", type=int, default=2048,
|
||||
help="random seed for initialization",)
|
||||
|
||||
|
@ -385,6 +389,7 @@ def parse_args():
|
|||
parser.add_argument(
|
||||
"--label_map_path", default="./labels/labels_ser.txt", type=str, required=False, )
|
||||
parser.add_argument("--infer_imgs", default=None, type=str, required=False)
|
||||
parser.add_argument("--resume", action='store_true')
|
||||
parser.add_argument("--ocr_json_path", default=None,
|
||||
type=str, required=False, help="ocr prediction results")
|
||||
# yapf: enable
|
||||
|
|
Loading…
Reference in New Issue