Support specifying which ranks to log when using the Fleet API (#3039)

* Support specifying which ranks to log when using the Fleet API

* Log max memory reserved

* log_ranks supports the str type, e.g. -o Global.log_ranks="0,1" (see the usage sketch below)

* Log max memory allocated

* Support specifying which ranks to log in static mode

* Log max memory reserved and max memory allocated in static mode
Tingquan Gao 2023-11-16 11:32:29 +08:00 committed by GitHub
parent cc0ac63fb8
commit ab087065e9
5 changed files with 54 additions and 33 deletions
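
With this change, the set of ranks that emit logs can be chosen at launch time through the Global.log_ranks option. Below is a minimal sketch of how the option flows into the logger, mirroring the engine change further down; the launch command, config values, and output path are illustrative, and init_logger is assumed to be importable from ppcls.utils.logger as shown in this commit.

    # Illustrative multi-GPU launch with the new override (paths are placeholders):
    #   python -m paddle.distributed.launch --gpus "0,1" tools/train.py \
    #       -c <your_config>.yaml -o Global.log_ranks="0,1"

    from ppcls.utils.logger import init_logger

    config = {"Global": {"output_dir": "./output", "log_ranks": "0,1"}}

    log_file = f"{config['Global']['output_dir']}/train.log"
    log_ranks = config["Global"].get("log_ranks", "0")   # "0" remains the default
    init_logger(log_file=log_file, log_ranks=log_ranks)  # only ranks 0 and 1 keep INFO logs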


@@ -74,7 +74,8 @@ class Engine(object):
# init logger
self.output_dir = self.config['Global']['output_dir']
log_file = os.path.join(self.output_dir, f"{mode}.log")
init_logger(log_file=log_file)
log_ranks = self.config['Global'].get("log_ranks", "0")
init_logger(log_file=log_file, log_ranks=log_ranks)
print_config(config)
# init train_func and eval_func
@@ -100,8 +101,9 @@ class Engine(object):
self.vdl_writer = LogWriter(logdir=vdl_writer_path)
# set device
assert self.config["Global"][
"device"] in ["cpu", "gpu", "xpu", "npu", "mlu", "ascend", "intel_gpu", "mps"]
assert self.config["Global"]["device"] in [
"cpu", "gpu", "xpu", "npu", "mlu", "ascend", "intel_gpu", "mps"
]
self.device = paddle.set_device(self.config["Global"]["device"])
logger.info('train with paddle {} and device {}'.format(
paddle.__version__, self.device))
@@ -487,9 +489,8 @@ class Engine(object):
False) or "ATTRMetric" in self.config["Metric"]["Eval"][0]
model = ExportModel(self.config["Arch"], self.model, use_multilabel)
if self.config["Global"]["pretrained_model"] is not None:
load_dygraph_pretrain(
model.base_model,
self.config["Global"]["pretrained_model"])
load_dygraph_pretrain(model.base_model,
self.config["Global"]["pretrained_model"])
model.eval()
@@ -517,7 +518,8 @@ class Engine(object):
paddle.jit.save(model, save_path)
if self.config["Global"].get("export_for_fd", False):
src_path = self.config["Global"]["infer_config_path"]
dst_path = os.path.join(self.config["Global"]["save_inference_dir"], 'inference.yml')
dst_path = os.path.join(
self.config["Global"]["save_inference_dir"], 'inference.yml')
shutil.copy(src_path, dst_path)
logger.info(
f"Export succeeded! The inference model exported has been saved in \"{self.config['Global']['save_inference_dir']}\"."


@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import absolute_import, division, print_function
import paddle
import datetime
from ppcls.utils import logger
from ppcls.utils.misc import AverageMeter
@@ -54,13 +55,16 @@ def log_info(trainer, batch_size, epoch_id, iter_id):
ips_msg = "ips: {:.5f} samples/s".format(
batch_size / trainer.time_info["batch_cost"].avg)
global_epochs = trainer.config["Global"]["epochs"]
eta_sec = (
(trainer.config["Global"]["epochs"] - epoch_id + 1) *
trainer.iter_per_epoch - iter_id) * trainer.time_info["batch_cost"].avg
eta_msg = "eta: {:s}".format(str(datetime.timedelta(seconds=int(eta_sec))))
logger.info("[Train][Epoch {}/{}][Iter: {}/{}]{}, {}, {}, {}, {}".format(
epoch_id, trainer.config["Global"]["epochs"], iter_id, trainer.
iter_per_epoch, lr_msg, metric_msg, time_msg, ips_msg, eta_msg))
max_mem_reserved_msg = f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved()}"
max_mem_allocated_msg = f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated()}"
logger.info(
f"[Train][Epoch {epoch_id}/{global_epochs}][Iter: {iter_id}/{trainer.iter_per_epoch}]{lr_msg}, {metric_msg}, {time_msg}, {ips_msg}, {eta_msg}, {max_mem_reserved_msg}, {max_mem_allocated_msg}"
)
for i, lr in enumerate(trainer.lr_sch):
logger.scaler(

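Both the dynamic and static training loops now report peak GPU memory through paddle.device.cuda.max_memory_reserved() and paddle.device.cuda.max_memory_allocated(), which return byte counts in recent Paddle releases. Below is a small sketch of reading the same counters outside the trainer; the CUDA guard and the MB formatting are illustrative additions, not part of the commit.

    import paddle

    # Both calls need a CUDA device; values are the peaks observed so far in the process.
    if paddle.device.is_compiled_with_cuda():
        reserved = paddle.device.cuda.max_memory_reserved()    # bytes
        allocated = paddle.device.cuda.max_memory_allocated()  # bytes
        print(f"max_mem_reserved: {reserved / 1024 ** 2:.1f} MB, "
              f"max_mem_allocated: {allocated / 1024 ** 2:.1f} MB")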

@@ -419,10 +419,12 @@ def run(dataloader,
else:
epoch_str = "epoch:{:<3d}".format(epoch)
step_str = "{:s} step:{:<4d}".format(mode, idx)
max_mem_reserved_str = f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved()}"
max_mem_allocated_str = f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated()}"
if idx % config.get('print_interval', 10) == 0:
logger.info("{:s} {:s} {:s}".format(epoch_str, step_str,
fetchs_str))
logger.info(
f"{epoch_str} {step_str} {fetchs_str} {max_mem_reserved_str} {max_mem_allocated_str}"
)
tic = time.time()


@@ -83,7 +83,8 @@ def main(args):
log_file = os.path.join(global_config['output_dir'],
config["Arch"]["name"], f"{mode}.log")
init_logger(log_file=log_file)
log_ranks = config["Global"].get("log_ranks", "0")
init_logger(log_file=log_file, log_ranks=log_ranks)
print_config(config)
if global_config.get("is_distributed", True):


@@ -22,7 +22,25 @@ import paddle.distributed as dist
_logger = None
def init_logger(name='ppcls', log_file=None, log_level=logging.INFO):
class LoggerHook(object):
"""
Logs are printed multiple times when calling the Fleet API.
Commonly, only the log at rank 0 needs to be displayed, and the others are ignored.
"""
block = False
def __init__(self, log):
self.log = log
def __call__(self, *args, **kwargs):
if not self.block:
self.log(*args, **kwargs)
def init_logger(name='ppcls',
log_file=None,
log_level=logging.INFO,
log_ranks="0"):
"""Initialize and get a logger by name.
If the logger has not been initialized, this method will initialize the
logger by adding one or two handlers, otherwise the initialized logger will
@@ -35,6 +53,7 @@ def init_logger(name='ppcls', log_file=None, log_level=logging.INFO):
log_level (int): The logger level. Note that only the process of
rank 0 is affected, and other processes will set the level to
"Error" thus be silent most of the time.
log_ranks (str): The ids of the ranks to log, separated by "," when there is more than one; "0" by default.
Returns:
logging.Logger: The expected logger.
"""
@@ -78,42 +97,35 @@ def init_logger(name='ppcls', log_file=None, log_level=logging.INFO):
if i == len(_logger.handlers) - 1:
_logger.addHandler(file_handler)
if dist.get_rank() == 0:
if isinstance(log_ranks, str):
log_ranks = [int(i) for i in log_ranks.split(',')]
elif isinstance(log_ranks, int):
log_ranks = [log_ranks]
if dist.get_rank() in log_ranks:
_logger.setLevel(log_level)
LoggerHook.block = False
else:
_logger.setLevel(logging.ERROR)
LoggerHook.block = True
_logger.propagate = False
def log_at_trainer0(log):
"""
logs will print multi-times when calling Fleet API.
Only display single log and ignore the others.
"""
def wrapper(fmt, *args):
if dist.get_rank() == 0:
log(fmt, *args)
return wrapper
@log_at_trainer0
@LoggerHook
def info(fmt, *args):
_logger.info(fmt, *args)
@log_at_trainer0
@LoggerHook
def debug(fmt, *args):
_logger.debug(fmt, *args)
@log_at_trainer0
@LoggerHook
def warning(fmt, *args):
_logger.warning(fmt, *args)
@log_at_trainer0
@LoggerHook
def error(fmt, *args):
_logger.error(fmt, *args)
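
The LoggerHook class above replaces the old log_at_trainer0 closure: every logging helper is wrapped by a LoggerHook instance, and a single class-level block flag, set once in init_logger according to log_ranks, silences all wrapped helpers on the other ranks. A standalone sketch of the same gating pattern follows, using illustrative names that are not part of the commit.

    class Hook(object):
        # One class-level flag gates every wrapped function at once.
        block = False

        def __init__(self, fn):
            self.fn = fn

        def __call__(self, *args, **kwargs):
            if not self.block:
                self.fn(*args, **kwargs)

    @Hook
    def info(msg):
        print(msg)

    info("kept")       # printed while Hook.block is False
    Hook.block = True  # e.g. this rank is not listed in log_ranks
    info("dropped")    # silently ignored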