Merge pull request #4967 from WenmuZhou/fix_vqa

SER and RE support distributed training
MissPenguin 2021-12-19 12:18:55 +08:00 committed by GitHub
commit 68deaab13f
3 changed files with 73 additions and 77 deletions
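
Taken together, the three diffs wire the SER and RE training scripts into Paddle's data-parallel setup and keep logging and checkpointing on rank 0. The sketch below shows that pattern in isolation; the rank/distributed names and the model._layers unwrap mirror the diff, while the toy model, optimizer, and output paths are illustrative stand-ins, not the actual PaddleOCR code.

import os

import paddle
from paddle import nn


def train(output_dir="./output", steps=10, print_step=5):
    # Query the launcher-provided process info once, up front.
    rank = paddle.distributed.get_rank()
    distributed = paddle.distributed.get_world_size() > 1
    if distributed:
        # Set up communication for multi-process training.
        paddle.distributed.init_parallel_env()

    model = nn.Linear(4, 2)  # stand-in for the LayoutXLM models in the PR
    if distributed:
        model = paddle.DataParallel(model)  # synchronizes gradients across ranks
    opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())

    for step in range(steps):
        x = paddle.randn([8, 4])
        loss = model(x).mean()
        loss.backward()
        opt.step()
        opt.clear_grad()
        if rank == 0 and step % print_step == 0:  # rank-0-only logging
            print(f"step {step}: loss {float(loss):.6f}")

    if rank == 0:
        # Only rank 0 writes checkpoints; unwrap DataParallel before saving,
        # which is what the diff does via model._layers.
        os.makedirs(output_dir, exist_ok=True)
        to_save = model._layers if distributed else model
        paddle.save(to_save.state_dict(), os.path.join(output_dir, "latest.pdparams"))


if __name__ == "__main__":
    train()  # runs single-process as-is; multi-GPU goes through python -m paddle.distributed.launch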

View File

@@ -15,13 +15,12 @@
 import os
 import re
 import sys
-# import Polygon
 import shapely
 from shapely.geometry import Polygon
 import numpy as np
 from collections import defaultdict
 import operator
-import editdistance
+import Levenshtein
 import argparse
 import json
 import copy
@@ -95,7 +94,7 @@ def ed(args, str1, str2):
     if args.ignore_case:
         str1 = str1.lower()
         str2 = str2.lower()
-    return editdistance.eval(str1, str2)
+    return Levenshtein.distance(str1, str2)


 def convert_bbox_to_polygon(bbox):
@@ -115,8 +114,6 @@ def eval_e2e(args):
     # pred
     dt_results = parse_ser_results_fp(args.pred_json_path, "pred",
                                       args.ignore_background)
-    assert set(gt_results.keys()) == set(dt_results.keys())
-
     iou_thresh = args.iou_thres
     num_gt_chars = 0
     gt_count = 0
@@ -124,7 +121,7 @@ def eval_e2e(args):
     hit = 0
     ed_sum = 0

-    for img_name in gt_results:
+    for img_name in dt_results:
         gt_info = gt_results[img_name]
         gt_count += len(gt_info)
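
The first file swaps the editdistance dependency for Levenshtein. For plain Python strings the two calls should agree, since both implement the standard Levenshtein edit distance; a quick sanity check (assuming both packages are installed) could look like:

import editdistance
import Levenshtein

for a, b in [("kitten", "sitting"), ("flaw", "lawn"), ("", "abc")]:
    # Both libraries compute the classic Levenshtein distance for str inputs.
    assert editdistance.eval(a, b) == Levenshtein.distance(a, b)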

View File

@@ -36,6 +36,9 @@ from ppocr.utils.logging import get_logger

 def train(args):
     logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
+    rank = paddle.distributed.get_rank()
+    distributed = paddle.distributed.get_world_size() > 1
+
     print_arguments(args, logger)

     # Added here for reproducibility (even between python 2 and 3)
@@ -45,7 +48,7 @@ def train(args):
     pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

     # dist mode
-    if paddle.distributed.get_world_size() > 1:
+    if distributed:
         paddle.distributed.init_parallel_env()

     tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
@@ -59,8 +62,8 @@ def train(args):
         args.model_name_or_path)

     # dist mode
-    if paddle.distributed.get_world_size() > 1:
-        model = paddle.distributed.DataParallel(model)
+    if distributed:
+        model = paddle.DataParallel(model)

     train_dataset = XFUNDataset(
         tokenizer,
@@ -90,8 +93,7 @@ def train(args):
     train_sampler = paddle.io.DistributedBatchSampler(
         train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)

-    args.train_batch_size = args.per_gpu_train_batch_size * \
-        max(1, paddle.distributed.get_world_size())
-
     train_dataloader = paddle.io.DataLoader(
         train_dataset,
         batch_sampler=train_sampler,
@@ -136,7 +138,8 @@ def train(args):
             args.per_gpu_train_batch_size))
     logger.info(
         " Total train batch size (w. parallel, distributed & accumulation) = {}".
-        format(args.train_batch_size * paddle.distributed.get_world_size()))
+        format(args.per_gpu_train_batch_size *
+               paddle.distributed.get_world_size()))
     logger.info(" Total optimization steps = {}".format(t_total))

     global_step = 0
@@ -170,7 +173,7 @@ def train(args):
             global_step += 1
             total_samples += batch['image'].shape[0]

-            if step % print_step == 0:
+            if rank == 0 and step % print_step == 0:
                 logger.info(
                     "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                     format(epoch, args.num_train_epochs, step,
@@ -185,38 +188,38 @@ def train(args):
                 train_run_cost = 0.0
                 total_samples = 0

-            if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
-                    global_step % args.eval_steps == 0):
+            if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
                 # Log metrics
-                if (paddle.distributed.get_rank() == 0 and args.
-                        evaluate_during_training):  # Only evaluate when single GPU otherwise metrics may not average well
-                    results = evaluate(model, eval_dataloader, logger)
-                    if results['f1'] >= best_metirc['f1']:
-                        best_metirc = results
-                        output_dir = os.path.join(args.output_dir, "best_model")
-                        os.makedirs(output_dir, exist_ok=True)
-                        model.save_pretrained(output_dir)
-                        tokenizer.save_pretrained(output_dir)
-                        paddle.save(args,
-                                    os.path.join(output_dir,
-                                                 "training_args.bin"))
-                        logger.info("Saving model checkpoint to {}".format(
-                            output_dir))
-                    logger.info("eval results: {}".format(results))
-                    logger.info("best_metirc: {}".format(best_metirc))
-
-            if paddle.distributed.get_rank() == 0:
-                # Save model checkpoint
-                output_dir = os.path.join(args.output_dir, "latest_model")
-                os.makedirs(output_dir, exist_ok=True)
-                if paddle.distributed.get_rank() == 0:
-                    model.save_pretrained(output_dir)
-                    tokenizer.save_pretrained(output_dir)
-                    paddle.save(args,
-                                os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to {}".format(
-                        output_dir))
+                # Only evaluate when single GPU otherwise metrics may not average well
+                results = evaluate(model, eval_dataloader, logger)
+                if results['f1'] >= best_metirc['f1']:
+                    best_metirc = results
+                    output_dir = os.path.join(args.output_dir, "best_model")
+                    os.makedirs(output_dir, exist_ok=True)
+                    if distributed:
+                        model._layers.save_pretrained(output_dir)
+                    else:
+                        model.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
+                    paddle.save(args,
+                                os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to {}".format(
+                        output_dir))
+                logger.info("eval results: {}".format(results))
+                logger.info("best_metirc: {}".format(best_metirc))

             reader_start = time.time()

+        if rank == 0:
+            # Save model checkpoint
+            output_dir = os.path.join(args.output_dir, "latest_model")
+            os.makedirs(output_dir, exist_ok=True)
+            if distributed:
+                model._layers.save_pretrained(output_dir)
+            else:
+                model.save_pretrained(output_dir)
+            tokenizer.save_pretrained(output_dir)
+            paddle.save(args, os.path.join(output_dir, "training_args.bin"))
+            logger.info("Saving model checkpoint to {}".format(output_dir))
+
     logger.info("best_metirc: {}".format(best_metirc))

View File

@@ -37,6 +37,9 @@ from ppocr.utils.logging import get_logger

 def train(args):
     os.makedirs(args.output_dir, exist_ok=True)
+    rank = paddle.distributed.get_rank()
+    distributed = paddle.distributed.get_world_size() > 1
+
     logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
     print_arguments(args, logger)

@@ -44,7 +47,7 @@ def train(args):
     pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

     # dist mode
-    if paddle.distributed.get_world_size() > 1:
+    if distributed:
         paddle.distributed.init_parallel_env()

     tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
@@ -59,7 +62,7 @@ def train(args):
         args.model_name_or_path)

     # dist mode
-    if paddle.distributed.get_world_size() > 1:
+    if distributed:
         model = paddle.DataParallel(model)

     train_dataset = XFUNDataset(
@@ -88,9 +91,6 @@ def train(args):
     train_sampler = paddle.io.DistributedBatchSampler(
         train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)

-    args.train_batch_size = args.per_gpu_train_batch_size * max(
-        1, paddle.distributed.get_world_size())
-
     train_dataloader = paddle.io.DataLoader(
         train_dataset,
         batch_sampler=train_sampler,
@@ -134,7 +134,7 @@ def train(args):
         args.per_gpu_train_batch_size)
     logger.info(
         " Total train batch size (w. parallel, distributed) = %d",
-        args.train_batch_size * paddle.distributed.get_world_size(), )
+        args.per_gpu_train_batch_size * paddle.distributed.get_world_size(), )
     logger.info(" Total optimization steps = %d", t_total)

     global_step = 0
@@ -168,7 +168,7 @@ def train(args):
             global_step += 1
             total_samples += batch['image'].shape[0]

-            if step % print_step == 0:
+            if rank == 0 and step % print_step == 0:
                 logger.info(
                     "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                     format(epoch_id, args.num_train_epochs, step,
@@ -183,47 +183,43 @@ def train(args):
                 train_run_cost = 0.0
                 total_samples = 0

-            if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
-                    global_step % args.eval_steps == 0):
+            if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
                 # Log metrics
                 # Only evaluate when single GPU otherwise metrics may not average well
-                if paddle.distributed.get_rank(
-                ) == 0 and args.evaluate_during_training:
-                    results, _ = evaluate(
-                        args, model, tokenizer, eval_dataloader, label2id_map,
-                        id2label_map, pad_token_label_id, logger)
-                    if best_metrics is None or results["f1"] >= best_metrics[
-                            "f1"]:
-                        best_metrics = copy.deepcopy(results)
-                        output_dir = os.path.join(args.output_dir, "best_model")
-                        os.makedirs(output_dir, exist_ok=True)
-                        if paddle.distributed.get_rank() == 0:
-                            model.save_pretrained(output_dir)
-                            tokenizer.save_pretrained(output_dir)
-                            paddle.save(
-                                args,
-                                os.path.join(output_dir, "training_args.bin"))
-                            logger.info("Saving model checkpoint to %s",
-                                        output_dir)
-                    logger.info("[epoch {}/{}][iter: {}/{}] results: {}".format(
-                        epoch_id, args.num_train_epochs, step,
-                        len(train_dataloader), results))
-                    if best_metrics is not None:
-                        logger.info("best metrics: {}".format(best_metrics))
-
-            if paddle.distributed.get_rank() == 0:
-                # Save model checkpoint
-                output_dir = os.path.join(args.output_dir, "latest_model")
-                os.makedirs(output_dir, exist_ok=True)
-                if paddle.distributed.get_rank() == 0:
-                    model.save_pretrained(output_dir)
-                    tokenizer.save_pretrained(output_dir)
-                    paddle.save(args,
-                                os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
+                results, _ = evaluate(args, model, tokenizer, eval_dataloader,
+                                      label2id_map, id2label_map,
+                                      pad_token_label_id, logger)
+                if best_metrics is None or results["f1"] >= best_metrics["f1"]:
+                    best_metrics = copy.deepcopy(results)
+                    output_dir = os.path.join(args.output_dir, "best_model")
+                    os.makedirs(output_dir, exist_ok=True)
+                    if distributed:
+                        model._layers.save_pretrained(output_dir)
+                    else:
+                        model.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
+                    paddle.save(args,
+                                os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+                logger.info("[epoch {}/{}][iter: {}/{}] results: {}".format(
+                    epoch_id, args.num_train_epochs, step,
+                    len(train_dataloader), results))
+                if best_metrics is not None:
+                    logger.info("best metrics: {}".format(best_metrics))

             reader_start = time.time()

+        if rank == 0:
+            # Save model checkpoint
+            output_dir = os.path.join(args.output_dir, "latest_model")
+            os.makedirs(output_dir, exist_ok=True)
+            if distributed:
+                model._layers.save_pretrained(output_dir)
+            else:
+                model.save_pretrained(output_dir)
+            tokenizer.save_pretrained(output_dir)
+            paddle.save(args, os.path.join(output_dir, "training_args.bin"))
+            logger.info("Saving model checkpoint to %s", output_dir)
+
     return global_step, tr_loss / global_step
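
Both scripts also stop caching args.train_batch_size and instead derive the effective global batch size when logging. With DistributedBatchSampler each rank draws per_gpu_train_batch_size samples per step, so the global batch is that value times the world size; a small check with a toy dataset (the dataset and numbers here are illustrative only):

import paddle
from paddle.io import Dataset, DistributedBatchSampler


class ToyDataset(Dataset):
    # Minimal dataset so the sampler has something to shard.
    def __len__(self):
        return 64

    def __getitem__(self, idx):
        return paddle.to_tensor([idx], dtype="float32")


per_gpu_train_batch_size = 8
sampler = DistributedBatchSampler(
    ToyDataset(), batch_size=per_gpu_train_batch_size, shuffle=True)

# Each rank sees batch_size samples per step, so the quantity the updated log
# line reports is per-GPU batch size times world size.
world_size = paddle.distributed.get_world_size()
print("per-GPU batch:", per_gpu_train_batch_size,
      "| global batch:", per_gpu_train_batch_size * world_size,
      "| steps per epoch on this rank:", len(sampler))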