add eval and ips (#4947)

* del unused code

* add eval

* add resume

* fix error
zhoujun 2021-12-16 21:51:24 -06:00 committed by GitHub
parent 8cd341095e
commit 5c6f90d6f7
8 changed files with 481 additions and 216 deletions


@@ -98,7 +98,7 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR
# The latest PaddleNLP code version is required for the installation
git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
cd PaddleNLP
- pip install -e .
pip3 install -e .
```
@@ -141,7 +141,6 @@ python3.7 train_ser.py \
  --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
  --num_train_epochs 200 \
  --eval_steps 10 \
- --save_steps 500 \
  --output_dir "./output/ser/" \
  --learning_rate 5e-5 \
  --warmup_steps 50 \
@@ -151,6 +150,48 @@ python3.7 train_ser.py \
Finally, metrics such as `precision`, `recall` and `f1` are printed, and the model and training logs are saved in the `./output/ser/` directory.

* Resume training
```shell
python3.7 train_ser.py \
--model_name_or_path "model_path" \
--train_data_dir "XFUND/zh_train/image" \
--train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
--eval_data_dir "XFUND/zh_val/image" \
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
--num_train_epochs 200 \
--eval_steps 10 \
--output_dir "./output/ser/" \
--learning_rate 5e-5 \
--warmup_steps 50 \
--evaluate_during_training \
--seed 2048 \
--resume
```
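
Here `--model_name_or_path` points at a checkpoint directory saved by a previous run. As the train_ser.py diff later in this commit shows, `--resume` roughly switches model construction from stacking a fresh classification head on the pretrained backbone to reloading the full fine-tuned checkpoint (a sketch, with the surrounding setup omitted):

```python
# Sketch of the resume branch this commit adds to train_ser.py.
if not args.resume:
    # fresh task head on top of the pretrained backbone
    model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
    model = LayoutXLMForTokenClassification(
        model, num_classes=len(label2id_map), dropout=None)
else:
    # reload backbone and task head together from the saved checkpoint
    model = LayoutXLMForTokenClassification.from_pretrained(
        args.model_name_or_path)
```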

* Evaluation
```shell
export CUDA_VISIBLE_DEVICES=0
python3 eval_ser.py \
--model_name_or_path "PP-Layout_v1.0_ser_pretrained/" \
--eval_data_dir "XFUND/zh_val/image" \
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
--per_gpu_eval_batch_size 8 \
--output_dir "output/ser/" \
--seed 2048
```
Finally, metrics such as `precision`, `recall` and `f1` are printed.
```shell
export CUDA_VISIBLE_DEVICES=0
python3.7 infer_ser.py \
--model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
--output_dir "output_res/" \
--infer_imgs "XFUND/zh_val/image/" \
--ocr_json_path "XFUND/zh_val/xfun_normalize_val.json"
```

* Predict using the OCR results provided with the evaluation set
```shell
@@ -188,6 +229,7 @@ python3.7 helper/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_nor

* Start training
```shell
export CUDA_VISIBLE_DEVICES=0
python3 train_re.py \
  --model_name_or_path "layoutxlm-base-uncased" \
  --train_data_dir "XFUND/zh_train/image" \
@@ -197,7 +239,6 @@ python3 train_re.py \
  --label_map_path 'labels/labels_ser.txt' \
  --num_train_epochs 2 \
  --eval_steps 10 \
- --save_steps 500 \
  --output_dir "output/re/" \
  --learning_rate 5e-5 \
  --warmup_steps 50 \
@@ -208,8 +249,48 @@ python3 train_re.py \
```

* Resume training
```shell
export CUDA_VISIBLE_DEVICES=0
python3 train_re.py \
--model_name_or_path "model_path" \
--train_data_dir "XFUND/zh_train/image" \
--train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
--eval_data_dir "XFUND/zh_val/image" \
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
--label_map_path 'labels/labels_ser.txt' \
--num_train_epochs 2 \
--eval_steps 10 \
--output_dir "output/re/" \
--learning_rate 5e-5 \
--warmup_steps 50 \
--per_gpu_train_batch_size 8 \
--per_gpu_eval_batch_size 8 \
--evaluate_during_training \
--seed 2048 \
--resume
```
Finally, metrics such as `precision`, `recall` and `f1` are printed, and the model and training logs are saved in the `./output/re/` directory.

* Evaluation
```shell
export CUDA_VISIBLE_DEVICES=0
python3 eval_re.py \
--model_name_or_path "output/check/checkpoint-best" \
--max_seq_length 512 \
--eval_data_dir "XFUND/zh_val/image" \
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
--label_map_path 'labels/labels_ser.txt' \
--output_dir "output/re_test/" \
--per_gpu_eval_batch_size 8 \
--seed 2048
```
Finally, metrics such as `precision`, `recall` and `f1` are printed.

* Predict using the OCR results provided with the evaluation set
```shell
@@ -231,7 +312,7 @@ python3 infer_re.py \
```shell
export CUDA_VISIBLE_DEVICES=0
- # python3.7 infer_ser_re_e2e.py \
python3.7 infer_ser_re_e2e.py \
  --model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
  --re_model_name_or_path "./PP-Layout_v1.0_re_pretrained/" \
  --max_seq_length 512 \

ppstructure/vqa/eval_re.py Normal file

@@ -0,0 +1,125 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
import paddle
from paddlenlp.transformers import LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForRelationExtraction
from xfun import XFUNDataset
from utils import parse_args, get_bio_label_maps, print_arguments
from data_collator import DataCollator
from metric import re_score
from ppocr.utils.logging import get_logger
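
# Build gold relation tuples from each sample's head/tail entity indices, then
# score the predicted relations with re_score (entity-boundary matching).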
def cal_metric(re_preds, re_labels, entities):
gt_relations = []
for b in range(len(re_labels)):
rel_sent = []
for head, tail in zip(re_labels[b]["head"], re_labels[b]["tail"]):
rel = {}
rel["head_id"] = head
rel["head"] = (entities[b]["start"][rel["head_id"]],
entities[b]["end"][rel["head_id"]])
rel["head_type"] = entities[b]["label"][rel["head_id"]]
rel["tail_id"] = tail
rel["tail"] = (entities[b]["start"][rel["tail_id"]],
entities[b]["end"][rel["tail_id"]])
rel["tail_type"] = entities[b]["label"][rel["tail_id"]]
rel["type"] = 1
rel_sent.append(rel)
gt_relations.append(rel_sent)
re_metrics = re_score(re_preds, gt_relations, mode="boundaries")
return re_metrics
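
# Run the RE model over the eval set, log per-batch loss, and reduce the
# relation metrics to overall precision / recall / f1.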
def evaluate(model, eval_dataloader, logger, prefix=""):
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = {}".format(len(eval_dataloader.dataset)))
re_preds = []
re_labels = []
entities = []
eval_loss = 0.0
model.eval()
for idx, batch in enumerate(eval_dataloader):
with paddle.no_grad():
outputs = model(**batch)
loss = outputs['loss'].mean().item()
if paddle.distributed.get_rank() == 0:
logger.info("[Eval] process: {}/{}, loss: {:.5f}".format(
idx, len(eval_dataloader), loss))
eval_loss += loss
re_preds.extend(outputs['pred_relations'])
re_labels.extend(batch['relations'])
entities.extend(batch['entities'])
re_metrics = cal_metric(re_preds, re_labels, entities)
re_metrics = {
"precision": re_metrics["ALL"]["p"],
"recall": re_metrics["ALL"]["r"],
"f1": re_metrics["ALL"]["f1"],
}
model.train()
return re_metrics
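
# Load the fine-tuned RE model, build the XFUND eval dataset/dataloader, and
# report relation-extraction metrics.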
def eval(args):
logger = get_logger()
label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
model = LayoutXLMForRelationExtraction.from_pretrained(
args.model_name_or_path)
eval_dataset = XFUNDataset(
tokenizer,
data_dir=args.eval_data_dir,
label_path=args.eval_label_path,
label2id_map=label2id_map,
img_size=(224, 224),
max_seq_len=args.max_seq_length,
pad_token_label_id=pad_token_label_id,
contains_re=True,
add_special_ids=False,
return_attention_mask=True,
load_mode='all')
eval_dataloader = paddle.io.DataLoader(
eval_dataset,
batch_size=args.per_gpu_eval_batch_size,
num_workers=8,
shuffle=False,
collate_fn=DataCollator())
results = evaluate(model, eval_dataloader, logger)
logger.info("eval results: {}".format(results))
if __name__ == "__main__":
args = parse_args()
eval(args)
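
For orientation, `cal_metric` expects per-sample structures like the following minimal, hypothetical example (field shapes inferred from the indexing in `cal_metric` above; not real XFUND output, and the prediction format is assumed here):

```python
# Hypothetical one-sample input for cal_metric; in "boundaries" mode re_score
# compares only the (head span, tail span) pairs, so this would score
# precision / recall / f1 = 1.0.
entities = [{
    "start": [0, 7],              # token index where each entity begins
    "end": [6, 12],               # token index where each entity ends
    "label": ["QUESTION", "ANSWER"],
}]
re_labels = [{"head": [0], "tail": [1]}]   # gold: entity 0 points to entity 1
re_preds = [[{                             # assumed model output format
    "head": (0, 6), "head_type": "QUESTION",
    "tail": (7, 12), "tail_type": "ANSWER",
    "type": 1,
}]]
metrics = cal_metric(re_preds, re_labels, entities)
```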

ppstructure/vqa/eval_ser.py Normal file

@@ -0,0 +1,154 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
import random
import time
import copy
import logging
import argparse
import paddle
import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
from xfun import XFUNDataset
from utils import parse_args, get_bio_label_maps, print_arguments
from ppocr.utils.logging import get_logger
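
# Load the fine-tuned SER model, build the XFUND eval dataloader, and report
# token-classification metrics.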
def eval(args):
logger = get_logger()
print_arguments(args, logger)
label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
model = LayoutXLMForTokenClassification.from_pretrained(
args.model_name_or_path)
eval_dataset = XFUNDataset(
tokenizer,
data_dir=args.eval_data_dir,
label_path=args.eval_label_path,
label2id_map=label2id_map,
img_size=(224, 224),
pad_token_label_id=pad_token_label_id,
contains_re=False,
add_special_ids=False,
return_attention_mask=True,
load_mode='all')
eval_dataloader = paddle.io.DataLoader(
eval_dataset,
batch_size=args.per_gpu_eval_batch_size,
num_workers=0,
use_shared_memory=True,
collate_fn=None, )
results, _ = evaluate(args, model, tokenizer, eval_dataloader, label2id_map,
id2label_map, pad_token_label_id, logger)
logger.info(results)
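
# Shared SER evaluation loop: accumulate logits and labels, drop padded
# positions, map ids back to BIO tags, and score with seqeval; also writes
# test_gt.txt / test_pred.txt to the output dir.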
def evaluate(args,
model,
tokenizer,
eval_dataloader,
label2id_map,
id2label_map,
pad_token_label_id,
logger,
prefix=""):
eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
model.eval()
for idx, batch in enumerate(eval_dataloader):
with paddle.no_grad():
outputs = model(**batch)
tmp_eval_loss, logits = outputs[:2]
tmp_eval_loss = tmp_eval_loss.mean()
if paddle.distributed.get_rank() == 0:
logger.info("[Eval]process: {}/{}, loss: {:.5f}".format(
idx, len(eval_dataloader), tmp_eval_loss.numpy()[0]))
eval_loss += tmp_eval_loss.item()
nb_eval_steps += 1
if preds is None:
preds = logits.numpy()
out_label_ids = batch["labels"].numpy()
else:
preds = np.append(preds, logits.numpy(), axis=0)
out_label_ids = np.append(
out_label_ids, batch["labels"].numpy(), axis=0)
eval_loss = eval_loss / nb_eval_steps
preds = np.argmax(preds, axis=2)
# label_map = {i: label.upper() for i, label in enumerate(labels)}
out_label_list = [[] for _ in range(out_label_ids.shape[0])]
preds_list = [[] for _ in range(out_label_ids.shape[0])]
for i in range(out_label_ids.shape[0]):
for j in range(out_label_ids.shape[1]):
if out_label_ids[i, j] != pad_token_label_id:
out_label_list[i].append(id2label_map[out_label_ids[i][j]])
preds_list[i].append(id2label_map[preds[i][j]])
results = {
"loss": eval_loss,
"precision": precision_score(out_label_list, preds_list),
"recall": recall_score(out_label_list, preds_list),
"f1": f1_score(out_label_list, preds_list),
}
with open(os.path.join(args.output_dir, "test_gt.txt"), "w") as fout:
for lbl in out_label_list:
for l in lbl:
fout.write(l + "\t")
fout.write("\n")
with open(os.path.join(args.output_dir, "test_pred.txt"), "w") as fout:
for lbl in preds_list:
for l in lbl:
fout.write(l + "\t")
fout.write("\n")
report = classification_report(out_label_list, preds_list)
logger.info("\n" + report)
logger.info("***** Eval results %s *****", prefix)
for key in sorted(results.keys()):
logger.info(" %s = %s", key, str(results[key]))
model.train()
return results, preds_list
if __name__ == "__main__":
args = parse_args()
eval(args)
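
The `precision`/`recall`/`f1` above are entity-level scores from seqeval: a predicted entity counts as correct only if its whole BIO span and its label both match the ground truth. A minimal, self-contained illustration with made-up tag sequences (not XFUND data):

```python
# Two gold entities (QUESTION, ANSWER); the prediction gets the span of the
# second one right but labels it HEADER, so only 1 of 2 entities is correct.
from seqeval.metrics import precision_score, recall_score, f1_score

gt = [["B-QUESTION", "I-QUESTION", "O", "B-ANSWER"]]
pred = [["B-QUESTION", "I-QUESTION", "O", "B-HEADER"]]

print(precision_score(gt, pred))  # 0.5
print(recall_score(gt, pred))     # 0.5
print(f1_score(gt, pred))         # 0.5
```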


@@ -24,9 +24,9 @@ import paddle
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
# relative reference
- from .utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps
- from .utils import pad_sentences, split_page, preprocess, postprocess, merge_preds_list_with_ocr_info
from utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps
from utils import pad_sentences, split_page, preprocess, postprocess, merge_preds_list_with_ocr_info
def trans_poly_to_bbox(poly):


@@ -1,2 +1,3 @@
sentencepiece
yacs
seqeval


@@ -20,80 +20,20 @@ sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
import random
import time
import numpy as np
import paddle
from paddlenlp.transformers import LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForRelationExtraction
from xfun import XFUNDataset
- from utils import parse_args, get_bio_label_maps, print_arguments
from utils import parse_args, get_bio_label_maps, print_arguments, set_seed
from data_collator import DataCollator
- from metric import re_score
from eval_re import evaluate
from ppocr.utils.logging import get_logger
- def set_seed(seed): ...                                       # moved unchanged to utils.py
- def cal_metric(re_preds, re_labels, entities): ...            # moved unchanged to eval_re.py (above)
- def evaluate(model, eval_dataloader, logger, prefix=""): ...  # moved unchanged to eval_re.py (above)
def train(args):
    logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
    print_arguments(args, logger)
@@ -109,9 +49,14 @@ def train(args):
    paddle.distributed.init_parallel_env()
    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
-     model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
-     model = LayoutXLMForRelationExtraction(model, dropout=None)
    if not args.resume:
        model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
        model = LayoutXLMForRelationExtraction(model, dropout=None)
        logger.info('train from scratch')
    else:
        logger.info('resume from {}'.format(args.model_name_or_path))
        model = LayoutXLMForRelationExtraction.from_pretrained(
            args.model_name_or_path)
    # dist mode
    if paddle.distributed.get_world_size() > 1:
@@ -200,24 +145,45 @@ def train(args):
    best_metirc = {'f1': 0}
    model.train()
train_reader_cost = 0.0
train_run_cost = 0.0
total_samples = 0
reader_start = time.time()
print_step = 1
    for epoch in range(int(args.num_train_epochs)):
        for step, batch in enumerate(train_dataloader):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            outputs = model(**batch)
            train_run_cost += time.time() - train_start
            # model outputs are always tuple in ppnlp (see doc)
            loss = outputs['loss']
            loss = loss.mean()
-             logger.info(
-                 "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {}, lr: {}".
-                 format(epoch, args.num_train_epochs, step, train_dataloader_len,
-                        global_step, np.mean(loss.numpy()), optimizer.get_lr()))
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            # lr_scheduler.step()  # Update learning rate schedule
            global_step += 1
total_samples += batch['image'].shape[0]
if step % print_step == 0:
logger.info(
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
format(epoch, args.num_train_epochs, step,
train_dataloader_len, global_step,
np.mean(loss.numpy()),
optimizer.get_lr(), train_reader_cost / print_step, (
train_reader_cost + train_run_cost) / print_step,
total_samples / print_step, total_samples / (
train_reader_cost + train_run_cost)))
train_reader_cost = 0.0
train_run_cost = 0.0
total_samples = 0
            if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
                    global_step % args.eval_steps == 0):
@@ -225,10 +191,9 @@ def train(args):
                if (paddle.distributed.get_rank() == 0 and args.
                        evaluate_during_training):  # Only evaluate when single GPU otherwise metrics may not average well
                    results = evaluate(model, eval_dataloader, logger)
-                     if results['f1'] > best_metirc['f1']:
                    if results['f1'] >= best_metirc['f1']:
                        best_metirc = results
-                         output_dir = os.path.join(args.output_dir,
-                                                   "checkpoint-best")
                        output_dir = os.path.join(args.output_dir, "best_model")
                        os.makedirs(output_dir, exist_ok=True)
                        model.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
@@ -240,10 +205,9 @@ def train(args):
                logger.info("eval results: {}".format(results))
                logger.info("best_metirc: {}".format(best_metirc))
-             if (paddle.distributed.get_rank() == 0 and args.save_steps > 0 and
-                     global_step % args.save_steps == 0):
            if paddle.distributed.get_rank() == 0:
                # Save model checkpoint
-                 output_dir = os.path.join(args.output_dir, "checkpoint-latest")
                output_dir = os.path.join(args.output_dir, "latest_model")
                os.makedirs(output_dir, exist_ok=True)
                if paddle.distributed.get_rank() == 0:
                    model.save_pretrained(output_dir)
@@ -252,6 +216,7 @@ def train(args):
                                os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to {}".format(
                        output_dir))
reader_start = time.time()
logger.info("best_metirc: {}".format(best_metirc)) logger.info("best_metirc: {}".format(best_metirc))


@@ -20,6 +20,7 @@ sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
import random
import time
import copy
import logging
@@ -29,19 +30,11 @@ import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
from xfun import XFUNDataset
- from utils import parse_args
- from utils import get_bio_label_maps
- from utils import print_arguments
from utils import parse_args, get_bio_label_maps, print_arguments, set_seed
from eval_ser import evaluate
from ppocr.utils.logging import get_logger
- def set_seed(args):
-     random.seed(args.seed)
-     np.random.seed(args.seed)
-     paddle.seed(args.seed)
def train(args):
    os.makedirs(args.output_dir, exist_ok=True)
    logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
@@ -55,9 +48,15 @@ def train(args):
    paddle.distributed.init_parallel_env()
    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
-     base_model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
-     model = LayoutXLMForTokenClassification(
-         base_model, num_classes=len(label2id_map), dropout=None)
    if not args.resume:
        model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
        model = LayoutXLMForTokenClassification(
            model, num_classes=len(label2id_map), dropout=None)
        logger.info('train from scratch')
    else:
        logger.info('resume from {}'.format(args.model_name_or_path))
        model = LayoutXLMForTokenClassification.from_pretrained(
            args.model_name_or_path)
    # dist mode
    if paddle.distributed.get_world_size() > 1:
@@ -74,6 +73,17 @@ def train(args):
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
eval_dataset = XFUNDataset(
tokenizer,
data_dir=args.eval_data_dir,
label_path=args.eval_label_path,
label2id_map=label2id_map,
img_size=(224, 224),
pad_token_label_id=pad_token_label_id,
contains_re=False,
add_special_ids=False,
return_attention_mask=True,
load_mode='all')
    train_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
@@ -88,6 +98,13 @@ def train(args):
        use_shared_memory=True,
        collate_fn=None, )
eval_dataloader = paddle.io.DataLoader(
eval_dataset,
batch_size=args.per_gpu_eval_batch_size,
num_workers=0,
use_shared_memory=True,
collate_fn=None, )
    t_total = len(train_dataloader) * args.num_train_epochs
    # build linear decay with warmup lr sch
@@ -122,28 +139,49 @@ def train(args):
    global_step = 0
    tr_loss = 0.0
-     set_seed(args)
    set_seed(args.seed)
    best_metrics = None
train_reader_cost = 0.0
train_run_cost = 0.0
total_samples = 0
reader_start = time.time()
print_step = 1
model.train()
    for epoch_id in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
-             model.train()
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            outputs = model(**batch)
            train_run_cost += time.time() - train_start
            # model outputs are always tuple in ppnlp (see doc)
            loss = outputs[0]
            loss = loss.mean()
-             logger.info(
-                 "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {}, lr: {}".
-                 format(epoch_id, args.num_train_epochs, step,
-                        len(train_dataloader), global_step,
-                        loss.numpy()[0], lr_scheduler.get_lr()))
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            lr_scheduler.step()  # Update learning rate schedule
            optimizer.clear_grad()
            global_step += 1
total_samples += batch['image'].shape[0]
if step % print_step == 0:
logger.info(
"epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
format(epoch_id, args.num_train_epochs, step,
len(train_dataloader), global_step,
loss.numpy()[0],
lr_scheduler.get_lr(), train_reader_cost /
print_step, (train_reader_cost + train_run_cost) /
print_step, total_samples / print_step, total_samples
/ (train_reader_cost + train_run_cost)))
train_reader_cost = 0.0
train_run_cost = 0.0
total_samples = 0
            if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
                    global_step % args.eval_steps == 0):
@@ -151,9 +189,9 @@ def train(args):
                # Only evaluate when single GPU otherwise metrics may not average well
                if paddle.distributed.get_rank(
                ) == 0 and args.evaluate_during_training:
-                     results, _ = evaluate(args, model, tokenizer, label2id_map,
-                                           id2label_map, pad_token_label_id,
-                                           logger)
                    results, _ = evaluate(
                        args, model, tokenizer, eval_dataloader, label2id_map,
                        id2label_map, pad_token_label_id, logger)
                    if best_metrics is None or results["f1"] >= best_metrics[
                            "f1"]:
@@ -175,11 +213,9 @@ def train(args):
            if best_metrics is not None:
                logger.info("best metrics: {}".format(best_metrics))
-             if paddle.distributed.get_rank(
-             ) == 0 and args.save_steps > 0 and global_step % args.save_steps == 0:
            if paddle.distributed.get_rank() == 0:
                # Save model checkpoint
-                 output_dir = os.path.join(args.output_dir,
-                                           "checkpoint-{}".format(global_step))
                output_dir = os.path.join(args.output_dir, "latest_model")
                os.makedirs(output_dir, exist_ok=True)
                if paddle.distributed.get_rank() == 0:
                    model.save_pretrained(output_dir)
@@ -187,112 +223,10 @@ def train(args):
                    paddle.save(args,
                                os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
reader_start = time.time()
    return global_step, tr_loss / global_step
- def evaluate(args, model, tokenizer, label2id_map, id2label_map,
-              pad_token_label_id, logger, prefix=""):
-     ...  # removed: the definition now lives in eval_ser.py (shown above), with the
-          # eval dataset/dataloader built once in train() and passed in as an argument
if __name__ == "__main__":
    args = parse_args()
    train(args)


@@ -25,6 +25,12 @@ import paddle
from PIL import Image, ImageDraw, ImageFont
def set_seed(seed):
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
def get_bio_label_maps(label_map_path):
    with open(label_map_path, "r") as fin:
        lines = fin.readlines()
@@ -375,8 +381,6 @@ def parse_args():
                        help="Linear warmup over warmup_steps.",)
    parser.add_argument("--eval_steps", type=int, default=10,
                        help="eval every X updates steps.",)
-     parser.add_argument("--save_steps", type=int, default=50,
-                         help="Save checkpoint every X updates steps.",)
    parser.add_argument("--seed", type=int, default=2048,
                        help="random seed for initialization",)
@@ -385,6 +389,7 @@ def parse_args():
    parser.add_argument(
        "--label_map_path", default="./labels/labels_ser.txt", type=str, required=False, )
    parser.add_argument("--infer_imgs", default=None, type=str, required=False)
parser.add_argument("--resume", action='store_true')
parser.add_argument("--ocr_json_path", default=None, parser.add_argument("--ocr_json_path", default=None,
type=str, required=False, help="ocr prediction results") type=str, required=False, help="ocr prediction results")
# yapf: enable # yapf: enable
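
For reference, the new `--resume` option is a plain `store_true` flag: it defaults to `False` and flips to `True` only when passed on the command line, as this standalone illustration shows:

```python
# Standalone illustration of the new flag's semantics.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--resume", action='store_true')

print(parser.parse_args([]).resume)            # False
print(parser.parse_args(["--resume"]).resume)  # True
```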