mirror of
https://github.com/PaddlePaddle/PaddleOCR.git
synced 2025-06-03 21:53:39 +08:00
add eval and ips (#4947)
* del unused code * add eval * add resume * fix error
This commit is contained in:
parent
8cd341095e
commit
5c6f90d6f7
@ -98,7 +98,7 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR
|
|||||||
# 需要使用PaddleNLP最新的代码版本进行安装
|
# 需要使用PaddleNLP最新的代码版本进行安装
|
||||||
git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
|
git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
|
||||||
cd PaddleNLP
|
cd PaddleNLP
|
||||||
pip install -e .
|
pip3 install -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
@ -141,7 +141,6 @@ python3.7 train_ser.py \
|
|||||||
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
||||||
--num_train_epochs 200 \
|
--num_train_epochs 200 \
|
||||||
--eval_steps 10 \
|
--eval_steps 10 \
|
||||||
--save_steps 500 \
|
|
||||||
--output_dir "./output/ser/" \
|
--output_dir "./output/ser/" \
|
||||||
--learning_rate 5e-5 \
|
--learning_rate 5e-5 \
|
||||||
--warmup_steps 50 \
|
--warmup_steps 50 \
|
||||||
@ -151,6 +150,48 @@ python3.7 train_ser.py \
|
|||||||
|
|
||||||
最终会打印出`precision`, `recall`, `f1`等指标,模型和训练日志会保存在`./output/ser/`文件夹中。
|
最终会打印出`precision`, `recall`, `f1`等指标,模型和训练日志会保存在`./output/ser/`文件夹中。
|
||||||
|
|
||||||
|
* 恢复训练
|
||||||
|
|
||||||
|
```shell
|
||||||
|
python3.7 train_ser.py \
|
||||||
|
--model_name_or_path "model_path" \
|
||||||
|
--train_data_dir "XFUND/zh_train/image" \
|
||||||
|
--train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
|
||||||
|
--eval_data_dir "XFUND/zh_val/image" \
|
||||||
|
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
||||||
|
--num_train_epochs 200 \
|
||||||
|
--eval_steps 10 \
|
||||||
|
--output_dir "./output/ser/" \
|
||||||
|
--learning_rate 5e-5 \
|
||||||
|
--warmup_steps 50 \
|
||||||
|
--evaluate_during_training \
|
||||||
|
--seed 2048 \
|
||||||
|
--resume
|
||||||
|
```
|
||||||
|
|
||||||
|
* 评估
|
||||||
|
```shell
|
||||||
|
export CUDA_VISIBLE_DEVICES=0
|
||||||
|
python3 eval_ser.py \
|
||||||
|
--model_name_or_path "PP-Layout_v1.0_ser_pretrained/" \
|
||||||
|
--eval_data_dir "XFUND/zh_val/image" \
|
||||||
|
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
||||||
|
--per_gpu_eval_batch_size 8 \
|
||||||
|
--output_dir "output/ser/" \
|
||||||
|
--seed 2048
|
||||||
|
```
|
||||||
|
最终会打印出`precision`, `recall`, `f1`等指标
|
||||||
|
|
||||||
|
|
||||||
|
```shell
|
||||||
|
export CUDA_VISIBLE_DEVICES=0
|
||||||
|
python3.7 infer_ser.py \
|
||||||
|
--model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
|
||||||
|
--output_dir "output_res/" \
|
||||||
|
--infer_imgs "XFUND/zh_val/image/" \
|
||||||
|
--ocr_json_path "XFUND/zh_val/xfun_normalize_val.json"
|
||||||
|
```
|
||||||
|
|
||||||
* 使用评估集合中提供的OCR识别结果进行预测
|
* 使用评估集合中提供的OCR识别结果进行预测
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
@ -188,6 +229,7 @@ python3.7 helper/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_nor
|
|||||||
* 启动训练
|
* 启动训练
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
|
export CUDA_VISIBLE_DEVICES=0
|
||||||
python3 train_re.py \
|
python3 train_re.py \
|
||||||
--model_name_or_path "layoutxlm-base-uncased" \
|
--model_name_or_path "layoutxlm-base-uncased" \
|
||||||
--train_data_dir "XFUND/zh_train/image" \
|
--train_data_dir "XFUND/zh_train/image" \
|
||||||
@ -197,7 +239,6 @@ python3 train_re.py \
|
|||||||
--label_map_path 'labels/labels_ser.txt' \
|
--label_map_path 'labels/labels_ser.txt' \
|
||||||
--num_train_epochs 2 \
|
--num_train_epochs 2 \
|
||||||
--eval_steps 10 \
|
--eval_steps 10 \
|
||||||
--save_steps 500 \
|
|
||||||
--output_dir "output/re/" \
|
--output_dir "output/re/" \
|
||||||
--learning_rate 5e-5 \
|
--learning_rate 5e-5 \
|
||||||
--warmup_steps 50 \
|
--warmup_steps 50 \
|
||||||
@ -208,8 +249,48 @@ python3 train_re.py \
|
|||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
* 恢复训练
|
||||||
|
|
||||||
|
```shell
|
||||||
|
export CUDA_VISIBLE_DEVICES=0
|
||||||
|
python3 train_re.py \
|
||||||
|
--model_name_or_path "model_path" \
|
||||||
|
--train_data_dir "XFUND/zh_train/image" \
|
||||||
|
--train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
|
||||||
|
--eval_data_dir "XFUND/zh_val/image" \
|
||||||
|
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
||||||
|
--label_map_path 'labels/labels_ser.txt' \
|
||||||
|
--num_train_epochs 2 \
|
||||||
|
--eval_steps 10 \
|
||||||
|
--output_dir "output/re/" \
|
||||||
|
--learning_rate 5e-5 \
|
||||||
|
--warmup_steps 50 \
|
||||||
|
--per_gpu_train_batch_size 8 \
|
||||||
|
--per_gpu_eval_batch_size 8 \
|
||||||
|
--evaluate_during_training \
|
||||||
|
--seed 2048 \
|
||||||
|
--resume
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
最终会打印出`precision`, `recall`, `f1`等指标,模型和训练日志会保存在`./output/re/`文件夹中。
|
最终会打印出`precision`, `recall`, `f1`等指标,模型和训练日志会保存在`./output/re/`文件夹中。
|
||||||
|
|
||||||
|
* 评估
|
||||||
|
```shell
|
||||||
|
export CUDA_VISIBLE_DEVICES=0
|
||||||
|
python3 eval_re.py \
|
||||||
|
--model_name_or_path "output/re/best_model" \
|
||||||
|
--max_seq_length 512 \
|
||||||
|
--eval_data_dir "XFUND/zh_val/image" \
|
||||||
|
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
|
||||||
|
--label_map_path 'labels/labels_ser.txt' \
|
||||||
|
--output_dir "output/re_test/" \
|
||||||
|
--per_gpu_eval_batch_size 8 \
|
||||||
|
--seed 2048
|
||||||
|
```
|
||||||
|
最终会打印出`precision`, `recall`, `f1`等指标
|
||||||
|
|
||||||
|
|
||||||
* 使用评估集合中提供的OCR识别结果进行预测
|
* 使用评估集合中提供的OCR识别结果进行预测
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
@ -231,7 +312,7 @@ python3 infer_re.py \
|
|||||||
|
|
||||||
```shell
|
```shell
|
||||||
export CUDA_VISIBLE_DEVICES=0
|
export CUDA_VISIBLE_DEVICES=0
|
||||||
# python3.7 infer_ser_re_e2e.py \
|
python3.7 infer_ser_re_e2e.py \
|
||||||
--model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
|
--model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
|
||||||
--re_model_name_or_path "./PP-Layout_v1.0_re_pretrained/" \
|
--re_model_name_or_path "./PP-Layout_v1.0_re_pretrained/" \
|
||||||
--max_seq_length 512 \
|
--max_seq_length 512 \
|
||||||
|
125
ppstructure/vqa/eval_re.py
Normal file
125
ppstructure/vqa/eval_re.py
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
sys.path.append(__dir__)
|
||||||
|
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from paddlenlp.transformers import LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForRelationExtraction
|
||||||
|
|
||||||
|
from xfun import XFUNDataset
|
||||||
|
from utils import parse_args, get_bio_label_maps, print_arguments
|
||||||
|
from data_collator import DataCollator
|
||||||
|
from metric import re_score
|
||||||
|
|
||||||
|
from ppocr.utils.logging import get_logger
|
||||||
|
|
||||||
|
|
||||||
|
def cal_metric(re_preds, re_labels, entities):
    """Score predicted relations against ground-truth relations.

    For every sample, each ground-truth (head, tail) pair in ``re_labels`` is
    expanded into a relation record by looking the two entity indices up in
    that sample's ``entities`` entry; the records are then handed to
    ``re_score`` together with the predictions.

    Args:
        re_preds: Predicted relations, forwarded unchanged to ``re_score``.
        re_labels: One mapping per sample with parallel ``"head"`` and
            ``"tail"`` sequences of entity indices.
        entities: One mapping per sample with ``"start"``, ``"end"`` and
            ``"label"`` sequences that are indexed by entity index.

    Returns:
        Whatever ``re_score(re_preds, gt_relations, mode="boundaries")``
        returns.
    """
    gt_relations = []
    for sample_idx, sample_labels in enumerate(re_labels):
        # The entity lookup stays inside the comprehension so a sample with
        # no relations never touches ``entities`` at all.
        sample_relations = [
            {
                "head_id": head,
                "head": (entities[sample_idx]["start"][head],
                         entities[sample_idx]["end"][head]),
                "head_type": entities[sample_idx]["label"][head],
                "tail_id": tail,
                "tail": (entities[sample_idx]["start"][tail],
                         entities[sample_idx]["end"][tail]),
                "tail_type": entities[sample_idx]["label"][tail],
                # Every ground-truth pair is recorded with relation type 1.
                "type": 1,
            }
            for head, tail in zip(sample_labels["head"], sample_labels["tail"])
        ]
        gt_relations.append(sample_relations)
    return re_score(re_preds, gt_relations, mode="boundaries")
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(model, eval_dataloader, logger, prefix=""):
    """Run relation-extraction evaluation over ``eval_dataloader``.

    The model is switched to eval mode, every batch is run under
    ``paddle.no_grad()``, and the predicted relations are collected together
    with the ground-truth relations and entities carried by each batch. The
    collected data is scored with ``cal_metric`` and the model is switched
    back to train mode before returning.

    Args:
        model: Callable as ``model(**batch)``; its output must provide
            ``'loss'`` and ``'pred_relations'``.
        eval_dataloader: Iterable of batches that also exposes ``dataset``
            and ``__len__``; each batch must provide ``'relations'`` and
            ``'entities'``.
        logger: Logger used for progress messages.
        prefix: Optional tag inserted into the banner log line.

    Returns:
        dict with ``"precision"``, ``"recall"`` and ``"f1"`` taken from the
        ``"ALL"`` entry of the ``cal_metric`` result.
    """
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = {}".format(len(eval_dataloader.dataset)))

    re_preds = []
    re_labels = []
    entities = []
    model.eval()
    for idx, batch in enumerate(eval_dataloader):
        with paddle.no_grad():
            outputs = model(**batch)
            # The per-batch loss is only reported in the log; it is not part
            # of the returned metrics, so no running total is kept.
            loss = outputs['loss'].mean().item()
            if paddle.distributed.get_rank() == 0:
                logger.info("[Eval] process: {}/{}, loss: {:.5f}".format(
                    idx, len(eval_dataloader), loss))

        re_preds.extend(outputs['pred_relations'])
        re_labels.extend(batch['relations'])
        entities.extend(batch['entities'])

    scores = cal_metric(re_preds, re_labels, entities)
    re_metrics = {
        "precision": scores["ALL"]["p"],
        "recall": scores["ALL"]["r"],
        "f1": scores["ALL"]["f1"],
    }
    # Callers in the training loop continue training right after evaluation.
    model.train()
    return re_metrics
|
||||||
|
|
||||||
|
|
||||||
|
def eval(args):
    """Evaluate a saved relation-extraction model on the evaluation set.

    Loads the tokenizer and ``LayoutXLMForRelationExtraction`` weights from
    ``args.model_name_or_path``, builds the XFUN evaluation dataset and
    dataloader from ``args``, runs ``evaluate`` and logs the resulting
    metrics.

    Args:
        args: Parsed command-line namespace; this function reads
            ``label_map_path``, ``model_name_or_path``, ``eval_data_dir``,
            ``eval_label_path``, ``max_seq_length`` and
            ``per_gpu_eval_batch_size``.
    """
    logger = get_logger()
    label2id_map, _ = get_bio_label_maps(args.label_map_path)
    # The dataset pads labels with the loss function's ignore_index.
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    checkpoint = args.model_name_or_path
    tokenizer = LayoutXLMTokenizer.from_pretrained(checkpoint)
    model = LayoutXLMForRelationExtraction.from_pretrained(checkpoint)

    dataset_options = dict(
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        max_seq_len=args.max_seq_length,
        pad_token_label_id=pad_token_label_id,
        contains_re=True,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    eval_dataset = XFUNDataset(tokenizer, **dataset_options)

    loader_options = dict(
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=8,
        shuffle=False,
        collate_fn=DataCollator())
    eval_dataloader = paddle.io.DataLoader(eval_dataset, **loader_options)

    metrics = evaluate(model, eval_dataloader, logger)
    logger.info("eval results: {}".format(metrics))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: parse the command line and run RE evaluation.
    eval(parse_args())
|
154
ppstructure/vqa/eval_ser.py
Normal file
154
ppstructure/vqa/eval_ser.py
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
sys.path.append(__dir__)
|
||||||
|
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||||
|
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
import copy
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import paddle
|
||||||
|
import numpy as np
|
||||||
|
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
|
||||||
|
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
|
||||||
|
from xfun import XFUNDataset
|
||||||
|
from utils import parse_args, get_bio_label_maps, print_arguments
|
||||||
|
|
||||||
|
from ppocr.utils.logging import get_logger
|
||||||
|
|
||||||
|
|
||||||
|
def eval(args):
    """Evaluate a saved SER (token-classification) model on the eval set.

    Logs the parsed arguments, loads the tokenizer and
    ``LayoutXLMForTokenClassification`` weights from
    ``args.model_name_or_path``, builds the XFUN evaluation dataset and
    dataloader, runs ``evaluate`` and logs the metrics it returns.

    Args:
        args: Parsed command-line namespace; this function reads
            ``label_map_path``, ``model_name_or_path``, ``eval_data_dir``,
            ``eval_label_path`` and ``per_gpu_eval_batch_size`` and forwards
            the whole namespace to ``evaluate``.
    """
    logger = get_logger()
    print_arguments(args, logger)

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    # Labels equal to the loss's ignore_index are skipped when ``evaluate``
    # converts ids back to label names.
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    checkpoint = args.model_name_or_path
    tokenizer = LayoutXLMTokenizer.from_pretrained(checkpoint)
    model = LayoutXLMForTokenClassification.from_pretrained(checkpoint)

    dataset_options = dict(
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=pad_token_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    eval_dataset = XFUNDataset(tokenizer, **dataset_options)

    loader_options = dict(
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=0,
        use_shared_memory=True,
        collate_fn=None)
    eval_dataloader = paddle.io.DataLoader(eval_dataset, **loader_options)

    # The second return value (the per-token predictions) is not needed here.
    metrics, _ = evaluate(args, model, tokenizer, eval_dataloader,
                          label2id_map, id2label_map, pad_token_label_id,
                          logger)

    logger.info(metrics)
|
||||||
|
|
||||||
|
|
||||||
|
def _dump_label_sequences(path, sequences):
    """Write one line per sequence, each label followed by a tab."""
    with open(path, "w", encoding="utf-8") as fout:
        fout.writelines("".join(label + "\t" for label in labels) + "\n"
                        for labels in sequences)


def evaluate(args,
             model,
             tokenizer,
             eval_dataloader,
             label2id_map,
             id2label_map,
             pad_token_label_id,
             logger,
             prefix=""):
    """Run SER evaluation, dump label files and report seqeval metrics.

    The model is switched to eval mode and every batch is run under
    ``paddle.no_grad()``. Logits and gold label ids are gathered across
    batches, the argmax over axis 2 of the logits is taken as the prediction,
    and positions whose gold label equals ``pad_token_label_id`` are dropped
    before ids are mapped to label names. Gold and predicted label sequences
    are written to ``test_gt.txt`` and ``test_pred.txt`` inside
    ``args.output_dir`` (created if missing). The model is switched back to
    train mode before returning.

    Args:
        args: Namespace providing ``output_dir``.
        model: Callable as ``model(**batch)``; the first two outputs are
            taken as (loss, logits).
        tokenizer: Unused here; kept for interface compatibility.
        eval_dataloader: Iterable of batches exposing ``__len__``; each batch
            must provide ``"labels"``.
        label2id_map: Unused here; kept for interface compatibility.
        id2label_map: Mapping from label id to label name.
        pad_token_label_id: Label id marking positions to ignore.
        logger: Logger used for progress and result messages.
        prefix: Optional tag inserted into the results banner.

    Returns:
        Tuple ``(results, preds_list)``: ``results`` is a dict with
        ``"loss"`` (mean per-batch loss), ``"precision"``, ``"recall"`` and
        ``"f1"``; ``preds_list`` holds the predicted label names per sample.

    Raises:
        ValueError: If ``eval_dataloader`` yields no batches.
    """
    eval_loss = 0.0
    # Collect per-batch arrays and concatenate once after the loop; growing
    # one array with np.append re-copies everything gathered so far.
    logit_chunks = []
    label_chunks = []
    model.eval()
    for idx, batch in enumerate(eval_dataloader):
        with paddle.no_grad():
            outputs = model(**batch)
            tmp_eval_loss, logits = outputs[:2]
            # .item() works whether mean() yields a 0-D or a 1-element
            # tensor, unlike indexing the numpy view with [0].
            batch_loss = tmp_eval_loss.mean().item()

            if paddle.distributed.get_rank() == 0:
                logger.info("[Eval]process: {}/{}, loss: {:.5f}".format(
                    idx, len(eval_dataloader), batch_loss))

            eval_loss += batch_loss
        logit_chunks.append(logits.numpy())
        label_chunks.append(batch["labels"].numpy())

    if not logit_chunks:
        model.train()
        raise ValueError(
            "eval_dataloader yielded no batches; nothing to evaluate")

    eval_loss = eval_loss / len(logit_chunks)
    preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=2)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    out_label_list = []
    preds_list = []
    for gold_row, pred_row in zip(out_label_ids, preds):
        # Keep only positions that carry a real (non-padding) gold label.
        keep = gold_row != pad_token_label_id
        out_label_list.append([id2label_map[i] for i in gold_row[keep]])
        preds_list.append([id2label_map[i] for i in pred_row[keep]])

    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

    # A standalone evaluation run may point at an output directory that does
    # not exist yet.
    os.makedirs(args.output_dir, exist_ok=True)
    _dump_label_sequences(
        os.path.join(args.output_dir, "test_gt.txt"), out_label_list)
    _dump_label_sequences(
        os.path.join(args.output_dir, "test_pred.txt"), preds_list)

    report = classification_report(out_label_list, preds_list)
    logger.info("\n" + report)

    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info(" %s = %s", key, str(results[key]))
    model.train()
    return results, preds_list
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: parse the command line and run SER evaluation.
    eval(parse_args())
|
@ -24,9 +24,9 @@ import paddle
|
|||||||
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
|
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
|
||||||
|
|
||||||
# relative reference
|
# relative reference
|
||||||
from .utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps
|
from utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps
|
||||||
|
|
||||||
from .utils import pad_sentences, split_page, preprocess, postprocess, merge_preds_list_with_ocr_info
|
from utils import pad_sentences, split_page, preprocess, postprocess, merge_preds_list_with_ocr_info
|
||||||
|
|
||||||
|
|
||||||
def trans_poly_to_bbox(poly):
|
def trans_poly_to_bbox(poly):
|
||||||
|
@ -1,2 +1,3 @@
|
|||||||
sentencepiece
|
sentencepiece
|
||||||
yacs
|
yacs
|
||||||
|
seqeval
|
@ -20,80 +20,20 @@ sys.path.append(__dir__)
|
|||||||
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||||
|
|
||||||
import random
|
import random
|
||||||
|
import time
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import paddle
|
import paddle
|
||||||
|
|
||||||
from paddlenlp.transformers import LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForRelationExtraction
|
from paddlenlp.transformers import LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForRelationExtraction
|
||||||
|
|
||||||
from xfun import XFUNDataset
|
from xfun import XFUNDataset
|
||||||
from utils import parse_args, get_bio_label_maps, print_arguments
|
from utils import parse_args, get_bio_label_maps, print_arguments, set_seed
|
||||||
from data_collator import DataCollator
|
from data_collator import DataCollator
|
||||||
from metric import re_score
|
from eval_re import evaluate
|
||||||
|
|
||||||
from ppocr.utils.logging import get_logger
|
from ppocr.utils.logging import get_logger
|
||||||
|
|
||||||
|
|
||||||
def set_seed(seed):
|
|
||||||
random.seed(seed)
|
|
||||||
np.random.seed(seed)
|
|
||||||
paddle.seed(seed)
|
|
||||||
|
|
||||||
|
|
||||||
def cal_metric(re_preds, re_labels, entities):
|
|
||||||
gt_relations = []
|
|
||||||
for b in range(len(re_labels)):
|
|
||||||
rel_sent = []
|
|
||||||
for head, tail in zip(re_labels[b]["head"], re_labels[b]["tail"]):
|
|
||||||
rel = {}
|
|
||||||
rel["head_id"] = head
|
|
||||||
rel["head"] = (entities[b]["start"][rel["head_id"]],
|
|
||||||
entities[b]["end"][rel["head_id"]])
|
|
||||||
rel["head_type"] = entities[b]["label"][rel["head_id"]]
|
|
||||||
|
|
||||||
rel["tail_id"] = tail
|
|
||||||
rel["tail"] = (entities[b]["start"][rel["tail_id"]],
|
|
||||||
entities[b]["end"][rel["tail_id"]])
|
|
||||||
rel["tail_type"] = entities[b]["label"][rel["tail_id"]]
|
|
||||||
|
|
||||||
rel["type"] = 1
|
|
||||||
rel_sent.append(rel)
|
|
||||||
gt_relations.append(rel_sent)
|
|
||||||
re_metrics = re_score(re_preds, gt_relations, mode="boundaries")
|
|
||||||
return re_metrics
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(model, eval_dataloader, logger, prefix=""):
|
|
||||||
# Eval!
|
|
||||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
|
||||||
logger.info(" Num examples = {}".format(len(eval_dataloader.dataset)))
|
|
||||||
|
|
||||||
re_preds = []
|
|
||||||
re_labels = []
|
|
||||||
entities = []
|
|
||||||
eval_loss = 0.0
|
|
||||||
model.eval()
|
|
||||||
for idx, batch in enumerate(eval_dataloader):
|
|
||||||
with paddle.no_grad():
|
|
||||||
outputs = model(**batch)
|
|
||||||
loss = outputs['loss'].mean().item()
|
|
||||||
if paddle.distributed.get_rank() == 0:
|
|
||||||
logger.info("[Eval] process: {}/{}, loss: {:.5f}".format(
|
|
||||||
idx, len(eval_dataloader), loss))
|
|
||||||
|
|
||||||
eval_loss += loss
|
|
||||||
re_preds.extend(outputs['pred_relations'])
|
|
||||||
re_labels.extend(batch['relations'])
|
|
||||||
entities.extend(batch['entities'])
|
|
||||||
re_metrics = cal_metric(re_preds, re_labels, entities)
|
|
||||||
re_metrics = {
|
|
||||||
"precision": re_metrics["ALL"]["p"],
|
|
||||||
"recall": re_metrics["ALL"]["r"],
|
|
||||||
"f1": re_metrics["ALL"]["f1"],
|
|
||||||
}
|
|
||||||
model.train()
|
|
||||||
return re_metrics
|
|
||||||
|
|
||||||
|
|
||||||
def train(args):
|
def train(args):
|
||||||
logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
|
logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
|
||||||
print_arguments(args, logger)
|
print_arguments(args, logger)
|
||||||
@ -109,9 +49,14 @@ def train(args):
|
|||||||
paddle.distributed.init_parallel_env()
|
paddle.distributed.init_parallel_env()
|
||||||
|
|
||||||
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
|
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
|
||||||
|
if not args.resume:
|
||||||
model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
|
model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
|
||||||
model = LayoutXLMForRelationExtraction(model, dropout=None)
|
model = LayoutXLMForRelationExtraction(model, dropout=None)
|
||||||
|
logger.info('train from scratch')
|
||||||
|
else:
|
||||||
|
logger.info('resume from {}'.format(args.model_name_or_path))
|
||||||
|
model = LayoutXLMForRelationExtraction.from_pretrained(
|
||||||
|
args.model_name_or_path)
|
||||||
|
|
||||||
# dist mode
|
# dist mode
|
||||||
if paddle.distributed.get_world_size() > 1:
|
if paddle.distributed.get_world_size() > 1:
|
||||||
@ -200,24 +145,45 @@ def train(args):
|
|||||||
best_metirc = {'f1': 0}
|
best_metirc = {'f1': 0}
|
||||||
model.train()
|
model.train()
|
||||||
|
|
||||||
|
train_reader_cost = 0.0
|
||||||
|
train_run_cost = 0.0
|
||||||
|
total_samples = 0
|
||||||
|
reader_start = time.time()
|
||||||
|
|
||||||
|
print_step = 1
|
||||||
|
|
||||||
for epoch in range(int(args.num_train_epochs)):
|
for epoch in range(int(args.num_train_epochs)):
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
train_reader_cost += time.time() - reader_start
|
||||||
|
train_start = time.time()
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
|
train_run_cost += time.time() - train_start
|
||||||
# model outputs are always tuple in ppnlp (see doc)
|
# model outputs are always tuple in ppnlp (see doc)
|
||||||
loss = outputs['loss']
|
loss = outputs['loss']
|
||||||
loss = loss.mean()
|
loss = loss.mean()
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {}, lr: {}".
|
|
||||||
format(epoch, args.num_train_epochs, step, train_dataloader_len,
|
|
||||||
global_step, np.mean(loss.numpy()), optimizer.get_lr()))
|
|
||||||
|
|
||||||
loss.backward()
|
loss.backward()
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
optimizer.clear_grad()
|
optimizer.clear_grad()
|
||||||
# lr_scheduler.step() # Update learning rate schedule
|
# lr_scheduler.step() # Update learning rate schedule
|
||||||
|
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
total_samples += batch['image'].shape[0]
|
||||||
|
|
||||||
|
if step % print_step == 0:
|
||||||
|
logger.info(
|
||||||
|
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
|
||||||
|
format(epoch, args.num_train_epochs, step,
|
||||||
|
train_dataloader_len, global_step,
|
||||||
|
np.mean(loss.numpy()),
|
||||||
|
optimizer.get_lr(), train_reader_cost / print_step, (
|
||||||
|
train_reader_cost + train_run_cost) / print_step,
|
||||||
|
total_samples / print_step, total_samples / (
|
||||||
|
train_reader_cost + train_run_cost)))
|
||||||
|
|
||||||
|
train_reader_cost = 0.0
|
||||||
|
train_run_cost = 0.0
|
||||||
|
total_samples = 0
|
||||||
|
|
||||||
if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
|
if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
|
||||||
global_step % args.eval_steps == 0):
|
global_step % args.eval_steps == 0):
|
||||||
@ -225,10 +191,9 @@ def train(args):
|
|||||||
if (paddle.distributed.get_rank() == 0 and args.
|
if (paddle.distributed.get_rank() == 0 and args.
|
||||||
evaluate_during_training): # Only evaluate when single GPU otherwise metrics may not average well
|
evaluate_during_training): # Only evaluate when single GPU otherwise metrics may not average well
|
||||||
results = evaluate(model, eval_dataloader, logger)
|
results = evaluate(model, eval_dataloader, logger)
|
||||||
if results['f1'] > best_metirc['f1']:
|
if results['f1'] >= best_metirc['f1']:
|
||||||
best_metirc = results
|
best_metirc = results
|
||||||
output_dir = os.path.join(args.output_dir,
|
output_dir = os.path.join(args.output_dir, "best_model")
|
||||||
"checkpoint-best")
|
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
model.save_pretrained(output_dir)
|
model.save_pretrained(output_dir)
|
||||||
tokenizer.save_pretrained(output_dir)
|
tokenizer.save_pretrained(output_dir)
|
||||||
@ -240,10 +205,9 @@ def train(args):
|
|||||||
logger.info("eval results: {}".format(results))
|
logger.info("eval results: {}".format(results))
|
||||||
logger.info("best_metirc: {}".format(best_metirc))
|
logger.info("best_metirc: {}".format(best_metirc))
|
||||||
|
|
||||||
if (paddle.distributed.get_rank() == 0 and args.save_steps > 0 and
|
if paddle.distributed.get_rank() == 0:
|
||||||
global_step % args.save_steps == 0):
|
|
||||||
# Save model checkpoint
|
# Save model checkpoint
|
||||||
output_dir = os.path.join(args.output_dir, "checkpoint-latest")
|
output_dir = os.path.join(args.output_dir, "latest_model")
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
if paddle.distributed.get_rank() == 0:
|
if paddle.distributed.get_rank() == 0:
|
||||||
model.save_pretrained(output_dir)
|
model.save_pretrained(output_dir)
|
||||||
@ -252,6 +216,7 @@ def train(args):
|
|||||||
os.path.join(output_dir, "training_args.bin"))
|
os.path.join(output_dir, "training_args.bin"))
|
||||||
logger.info("Saving model checkpoint to {}".format(
|
logger.info("Saving model checkpoint to {}".format(
|
||||||
output_dir))
|
output_dir))
|
||||||
|
reader_start = time.time()
|
||||||
logger.info("best_metirc: {}".format(best_metirc))
|
logger.info("best_metirc: {}".format(best_metirc))
|
||||||
|
|
||||||
|
|
||||||
|
@ -20,6 +20,7 @@ sys.path.append(__dir__)
|
|||||||
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
|
||||||
|
|
||||||
import random
|
import random
|
||||||
|
import time
|
||||||
import copy
|
import copy
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
@ -29,19 +30,11 @@ import numpy as np
|
|||||||
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
|
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
|
||||||
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
|
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
|
||||||
from xfun import XFUNDataset
|
from xfun import XFUNDataset
|
||||||
from utils import parse_args
|
from utils import parse_args, get_bio_label_maps, print_arguments, set_seed
|
||||||
from utils import get_bio_label_maps
|
from eval_ser import evaluate
|
||||||
from utils import print_arguments
|
|
||||||
|
|
||||||
from ppocr.utils.logging import get_logger
|
from ppocr.utils.logging import get_logger
|
||||||
|
|
||||||
|
|
||||||
def set_seed(args):
|
|
||||||
random.seed(args.seed)
|
|
||||||
np.random.seed(args.seed)
|
|
||||||
paddle.seed(args.seed)
|
|
||||||
|
|
||||||
|
|
||||||
def train(args):
|
def train(args):
|
||||||
os.makedirs(args.output_dir, exist_ok=True)
|
os.makedirs(args.output_dir, exist_ok=True)
|
||||||
logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
|
logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
|
||||||
@ -55,9 +48,15 @@ def train(args):
|
|||||||
paddle.distributed.init_parallel_env()
|
paddle.distributed.init_parallel_env()
|
||||||
|
|
||||||
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
|
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
|
||||||
base_model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
|
if not args.resume:
|
||||||
model = LayoutXLMForTokenClassification(
|
model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
|
||||||
base_model, num_classes=len(label2id_map), dropout=None)
|
model = LayoutXLMForTokenClassification(
|
||||||
|
model, num_classes=len(label2id_map), dropout=None)
|
||||||
|
logger.info('train from scratch')
|
||||||
|
else:
|
||||||
|
logger.info('resume from {}'.format(args.model_name_or_path))
|
||||||
|
model = LayoutXLMForTokenClassification.from_pretrained(
|
||||||
|
args.model_name_or_path)
|
||||||
|
|
||||||
# dist mode
|
# dist mode
|
||||||
if paddle.distributed.get_world_size() > 1:
|
if paddle.distributed.get_world_size() > 1:
|
||||||
@ -74,6 +73,17 @@ def train(args):
|
|||||||
add_special_ids=False,
|
add_special_ids=False,
|
||||||
return_attention_mask=True,
|
return_attention_mask=True,
|
||||||
load_mode='all')
|
load_mode='all')
|
||||||
|
eval_dataset = XFUNDataset(
|
||||||
|
tokenizer,
|
||||||
|
data_dir=args.eval_data_dir,
|
||||||
|
label_path=args.eval_label_path,
|
||||||
|
label2id_map=label2id_map,
|
||||||
|
img_size=(224, 224),
|
||||||
|
pad_token_label_id=pad_token_label_id,
|
||||||
|
contains_re=False,
|
||||||
|
add_special_ids=False,
|
||||||
|
return_attention_mask=True,
|
||||||
|
load_mode='all')
|
||||||
|
|
||||||
train_sampler = paddle.io.DistributedBatchSampler(
|
train_sampler = paddle.io.DistributedBatchSampler(
|
||||||
train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
|
train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
|
||||||
@ -88,6 +98,13 @@ def train(args):
|
|||||||
use_shared_memory=True,
|
use_shared_memory=True,
|
||||||
collate_fn=None, )
|
collate_fn=None, )
|
||||||
|
|
||||||
|
eval_dataloader = paddle.io.DataLoader(
|
||||||
|
eval_dataset,
|
||||||
|
batch_size=args.per_gpu_eval_batch_size,
|
||||||
|
num_workers=0,
|
||||||
|
use_shared_memory=True,
|
||||||
|
collate_fn=None, )
|
||||||
|
|
||||||
t_total = len(train_dataloader) * args.num_train_epochs
|
t_total = len(train_dataloader) * args.num_train_epochs
|
||||||
|
|
||||||
# build linear decay with warmup lr sch
|
# build linear decay with warmup lr sch
|
||||||
@ -122,28 +139,49 @@ def train(args):
|
|||||||
|
|
||||||
global_step = 0
|
global_step = 0
|
||||||
tr_loss = 0.0
|
tr_loss = 0.0
|
||||||
set_seed(args)
|
set_seed(args.seed)
|
||||||
best_metrics = None
|
best_metrics = None
|
||||||
|
|
||||||
|
train_reader_cost = 0.0
|
||||||
|
train_run_cost = 0.0
|
||||||
|
total_samples = 0
|
||||||
|
reader_start = time.time()
|
||||||
|
|
||||||
|
print_step = 1
|
||||||
|
model.train()
|
||||||
for epoch_id in range(args.num_train_epochs):
|
for epoch_id in range(args.num_train_epochs):
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
model.train()
|
train_reader_cost += time.time() - reader_start
|
||||||
|
|
||||||
|
train_start = time.time()
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
|
train_run_cost += time.time() - train_start
|
||||||
|
|
||||||
# model outputs are always tuple in ppnlp (see doc)
|
# model outputs are always tuple in ppnlp (see doc)
|
||||||
loss = outputs[0]
|
loss = outputs[0]
|
||||||
loss = loss.mean()
|
loss = loss.mean()
|
||||||
logger.info(
|
|
||||||
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {}, lr: {}".
|
|
||||||
format(epoch_id, args.num_train_epochs, step,
|
|
||||||
len(train_dataloader), global_step,
|
|
||||||
loss.numpy()[0], lr_scheduler.get_lr()))
|
|
||||||
|
|
||||||
loss.backward()
|
loss.backward()
|
||||||
tr_loss += loss.item()
|
tr_loss += loss.item()
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
lr_scheduler.step() # Update learning rate schedule
|
lr_scheduler.step() # Update learning rate schedule
|
||||||
optimizer.clear_grad()
|
optimizer.clear_grad()
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
total_samples += batch['image'].shape[0]
|
||||||
|
|
||||||
|
if step % print_step == 0:
|
||||||
|
logger.info(
|
||||||
|
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
|
||||||
|
format(epoch_id, args.num_train_epochs, step,
|
||||||
|
len(train_dataloader), global_step,
|
||||||
|
loss.numpy()[0],
|
||||||
|
lr_scheduler.get_lr(), train_reader_cost /
|
||||||
|
print_step, (train_reader_cost + train_run_cost) /
|
||||||
|
print_step, total_samples / print_step, total_samples
|
||||||
|
/ (train_reader_cost + train_run_cost)))
|
||||||
|
|
||||||
|
train_reader_cost = 0.0
|
||||||
|
train_run_cost = 0.0
|
||||||
|
total_samples = 0
|
||||||
|
|
||||||
if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
|
if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
|
||||||
global_step % args.eval_steps == 0):
|
global_step % args.eval_steps == 0):
|
||||||
@ -151,9 +189,9 @@ def train(args):
|
|||||||
# Only evaluate when single GPU otherwise metrics may not average well
|
# Only evaluate when single GPU otherwise metrics may not average well
|
||||||
if paddle.distributed.get_rank(
|
if paddle.distributed.get_rank(
|
||||||
) == 0 and args.evaluate_during_training:
|
) == 0 and args.evaluate_during_training:
|
||||||
results, _ = evaluate(args, model, tokenizer, label2id_map,
|
results, _ = evaluate(
|
||||||
id2label_map, pad_token_label_id,
|
args, model, tokenizer, eval_dataloader, label2id_map,
|
||||||
logger)
|
id2label_map, pad_token_label_id, logger)
|
||||||
|
|
||||||
if best_metrics is None or results["f1"] >= best_metrics[
|
if best_metrics is None or results["f1"] >= best_metrics[
|
||||||
"f1"]:
|
"f1"]:
|
||||||
@ -175,11 +213,9 @@ def train(args):
|
|||||||
if best_metrics is not None:
|
if best_metrics is not None:
|
||||||
logger.info("best metrics: {}".format(best_metrics))
|
logger.info("best metrics: {}".format(best_metrics))
|
||||||
|
|
||||||
if paddle.distributed.get_rank(
|
if paddle.distributed.get_rank() == 0:
|
||||||
) == 0 and args.save_steps > 0 and global_step % args.save_steps == 0:
|
|
||||||
# Save model checkpoint
|
# Save model checkpoint
|
||||||
output_dir = os.path.join(args.output_dir,
|
output_dir = os.path.join(args.output_dir, "latest_model")
|
||||||
"checkpoint-{}".format(global_step))
|
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
if paddle.distributed.get_rank() == 0:
|
if paddle.distributed.get_rank() == 0:
|
||||||
model.save_pretrained(output_dir)
|
model.save_pretrained(output_dir)
|
||||||
@ -187,112 +223,10 @@ def train(args):
|
|||||||
paddle.save(args,
|
paddle.save(args,
|
||||||
os.path.join(output_dir, "training_args.bin"))
|
os.path.join(output_dir, "training_args.bin"))
|
||||||
logger.info("Saving model checkpoint to %s", output_dir)
|
logger.info("Saving model checkpoint to %s", output_dir)
|
||||||
|
reader_start = time.time()
|
||||||
return global_step, tr_loss / global_step
|
return global_step, tr_loss / global_step
|
||||||
|
|
||||||
|
|
||||||
def evaluate(args,
|
|
||||||
model,
|
|
||||||
tokenizer,
|
|
||||||
label2id_map,
|
|
||||||
id2label_map,
|
|
||||||
pad_token_label_id,
|
|
||||||
logger,
|
|
||||||
prefix=""):
|
|
||||||
eval_dataset = XFUNDataset(
|
|
||||||
tokenizer,
|
|
||||||
data_dir=args.eval_data_dir,
|
|
||||||
label_path=args.eval_label_path,
|
|
||||||
label2id_map=label2id_map,
|
|
||||||
img_size=(224, 224),
|
|
||||||
pad_token_label_id=pad_token_label_id,
|
|
||||||
contains_re=False,
|
|
||||||
add_special_ids=False,
|
|
||||||
return_attention_mask=True,
|
|
||||||
load_mode='all')
|
|
||||||
|
|
||||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(
|
|
||||||
1, paddle.distributed.get_world_size())
|
|
||||||
|
|
||||||
eval_dataloader = paddle.io.DataLoader(
|
|
||||||
eval_dataset,
|
|
||||||
batch_size=args.eval_batch_size,
|
|
||||||
num_workers=0,
|
|
||||||
use_shared_memory=True,
|
|
||||||
collate_fn=None, )
|
|
||||||
|
|
||||||
# Eval!
|
|
||||||
logger.info("***** Running evaluation %s *****", prefix)
|
|
||||||
logger.info(" Num examples = %d", len(eval_dataset))
|
|
||||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
|
||||||
eval_loss = 0.0
|
|
||||||
nb_eval_steps = 0
|
|
||||||
preds = None
|
|
||||||
out_label_ids = None
|
|
||||||
model.eval()
|
|
||||||
for idx, batch in enumerate(eval_dataloader):
|
|
||||||
with paddle.no_grad():
|
|
||||||
outputs = model(**batch)
|
|
||||||
tmp_eval_loss, logits = outputs[:2]
|
|
||||||
|
|
||||||
tmp_eval_loss = tmp_eval_loss.mean()
|
|
||||||
|
|
||||||
if paddle.distributed.get_rank() == 0:
|
|
||||||
logger.info("[Eval]process: {}/{}, loss: {:.5f}".format(
|
|
||||||
idx, len(eval_dataloader), tmp_eval_loss.numpy()[0]))
|
|
||||||
|
|
||||||
eval_loss += tmp_eval_loss.item()
|
|
||||||
nb_eval_steps += 1
|
|
||||||
if preds is None:
|
|
||||||
preds = logits.numpy()
|
|
||||||
out_label_ids = batch["labels"].numpy()
|
|
||||||
else:
|
|
||||||
preds = np.append(preds, logits.numpy(), axis=0)
|
|
||||||
out_label_ids = np.append(
|
|
||||||
out_label_ids, batch["labels"].numpy(), axis=0)
|
|
||||||
|
|
||||||
eval_loss = eval_loss / nb_eval_steps
|
|
||||||
preds = np.argmax(preds, axis=2)
|
|
||||||
|
|
||||||
# label_map = {i: label.upper() for i, label in enumerate(labels)}
|
|
||||||
|
|
||||||
out_label_list = [[] for _ in range(out_label_ids.shape[0])]
|
|
||||||
preds_list = [[] for _ in range(out_label_ids.shape[0])]
|
|
||||||
|
|
||||||
for i in range(out_label_ids.shape[0]):
|
|
||||||
for j in range(out_label_ids.shape[1]):
|
|
||||||
if out_label_ids[i, j] != pad_token_label_id:
|
|
||||||
out_label_list[i].append(id2label_map[out_label_ids[i][j]])
|
|
||||||
preds_list[i].append(id2label_map[preds[i][j]])
|
|
||||||
|
|
||||||
results = {
|
|
||||||
"loss": eval_loss,
|
|
||||||
"precision": precision_score(out_label_list, preds_list),
|
|
||||||
"recall": recall_score(out_label_list, preds_list),
|
|
||||||
"f1": f1_score(out_label_list, preds_list),
|
|
||||||
}
|
|
||||||
|
|
||||||
with open(os.path.join(args.output_dir, "test_gt.txt"), "w") as fout:
|
|
||||||
for lbl in out_label_list:
|
|
||||||
for l in lbl:
|
|
||||||
fout.write(l + "\t")
|
|
||||||
fout.write("\n")
|
|
||||||
with open(os.path.join(args.output_dir, "test_pred.txt"), "w") as fout:
|
|
||||||
for lbl in preds_list:
|
|
||||||
for l in lbl:
|
|
||||||
fout.write(l + "\t")
|
|
||||||
fout.write("\n")
|
|
||||||
|
|
||||||
report = classification_report(out_label_list, preds_list)
|
|
||||||
logger.info("\n" + report)
|
|
||||||
|
|
||||||
logger.info("***** Eval results %s *****", prefix)
|
|
||||||
for key in sorted(results.keys()):
|
|
||||||
logger.info(" %s = %s", key, str(results[key]))
|
|
||||||
|
|
||||||
return results, preds_list
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
train(args)
|
train(args)
|
||||||
|
@ -25,6 +25,12 @@ import paddle
|
|||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
|
||||||
|
def set_seed(seed):
|
||||||
|
random.seed(seed)
|
||||||
|
np.random.seed(seed)
|
||||||
|
paddle.seed(seed)
|
||||||
|
|
||||||
|
|
||||||
def get_bio_label_maps(label_map_path):
|
def get_bio_label_maps(label_map_path):
|
||||||
with open(label_map_path, "r") as fin:
|
with open(label_map_path, "r") as fin:
|
||||||
lines = fin.readlines()
|
lines = fin.readlines()
|
||||||
@ -375,8 +381,6 @@ def parse_args():
|
|||||||
help="Linear warmup over warmup_steps.",)
|
help="Linear warmup over warmup_steps.",)
|
||||||
parser.add_argument("--eval_steps", type=int, default=10,
|
parser.add_argument("--eval_steps", type=int, default=10,
|
||||||
help="eval every X updates steps.",)
|
help="eval every X updates steps.",)
|
||||||
parser.add_argument("--save_steps", type=int, default=50,
|
|
||||||
help="Save checkpoint every X updates steps.",)
|
|
||||||
parser.add_argument("--seed", type=int, default=2048,
|
parser.add_argument("--seed", type=int, default=2048,
|
||||||
help="random seed for initialization",)
|
help="random seed for initialization",)
|
||||||
|
|
||||||
@ -385,6 +389,7 @@ def parse_args():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--label_map_path", default="./labels/labels_ser.txt", type=str, required=False, )
|
"--label_map_path", default="./labels/labels_ser.txt", type=str, required=False, )
|
||||||
parser.add_argument("--infer_imgs", default=None, type=str, required=False)
|
parser.add_argument("--infer_imgs", default=None, type=str, required=False)
|
||||||
|
parser.add_argument("--resume", action='store_true')
|
||||||
parser.add_argument("--ocr_json_path", default=None,
|
parser.add_argument("--ocr_json_path", default=None,
|
||||||
type=str, required=False, help="ocr prediction results")
|
type=str, required=False, help="ocr prediction results")
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
|
Loading…
x
Reference in New Issue
Block a user