support to specify rank to log when using Fleet API (#3039)

* support to specify rank to log when using Fleet API
* log max mem reserved
* log_ranks supports str type, for example: -o Global.log_ranks="0,1"
* log max mem allocated
* support to specify rank to log in static mode
* log max mem reserved and max mem allocated in static mode

pull/3041/head
parent cc0ac63fb8
commit ab087065e9
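Note on usage: the option is read from the Global section of the config and can also be set on the command line, e.g. -o Global.log_ranks="0,1". Below is a minimal, standalone sketch of how such a string is turned into a list of rank ids and used to decide whether the current process should log; it mirrors the parsing added to init_logger in this commit, and the helper name should_log is illustrative only, not part of the change.

# Illustrative sketch (not part of the commit): parse a comma-separated
# rank string and check whether the current rank is allowed to log.
import paddle.distributed as dist

def should_log(log_ranks="0"):
    # Accept a string such as "0" or "0,1", or a plain int, as in the commit.
    if isinstance(log_ranks, str):
        log_ranks = [int(i) for i in log_ranks.split(',')]
    elif isinstance(log_ranks, int):
        log_ranks = [log_ranks]
    return dist.get_rank() in log_ranks

# With the default "0" only rank 0 logs; "0,1" lets ranks 0 and 1 both log.
print(should_log("0,1"))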
@@ -74,7 +74,8 @@ class Engine(object):
         # init logger
         self.output_dir = self.config['Global']['output_dir']
         log_file = os.path.join(self.output_dir, f"{mode}.log")
-        init_logger(log_file=log_file)
+        log_ranks = self.config['Global'].get("log_ranks", "0")
+        init_logger(log_file=log_file, log_ranks=log_ranks)
         print_config(config)

         # init train_func and eval_func
@@ -100,8 +101,9 @@ class Engine(object):
             self.vdl_writer = LogWriter(logdir=vdl_writer_path)

         # set device
-        assert self.config["Global"][
-            "device"] in ["cpu", "gpu", "xpu", "npu", "mlu", "ascend", "intel_gpu", "mps"]
+        assert self.config["Global"]["device"] in [
+            "cpu", "gpu", "xpu", "npu", "mlu", "ascend", "intel_gpu", "mps"
+        ]
         self.device = paddle.set_device(self.config["Global"]["device"])
         logger.info('train with paddle {} and device {}'.format(
             paddle.__version__, self.device))
@@ -487,9 +489,8 @@ class Engine(object):
             False) or "ATTRMetric" in self.config["Metric"]["Eval"][0]
         model = ExportModel(self.config["Arch"], self.model, use_multilabel)
         if self.config["Global"]["pretrained_model"] is not None:
-            load_dygraph_pretrain(
-                model.base_model,
-                self.config["Global"]["pretrained_model"])
+            load_dygraph_pretrain(model.base_model,
+                                  self.config["Global"]["pretrained_model"])

         model.eval()

@@ -517,7 +518,8 @@ class Engine(object):
         paddle.jit.save(model, save_path)
         if self.config["Global"].get("export_for_fd", False):
             src_path = self.config["Global"]["infer_config_path"]
-            dst_path = os.path.join(self.config["Global"]["save_inference_dir"], 'inference.yml')
+            dst_path = os.path.join(
+                self.config["Global"]["save_inference_dir"], 'inference.yml')
             shutil.copy(src_path, dst_path)
         logger.info(
             f"Export succeeded! The inference model exported has been saved in \"{self.config['Global']['save_inference_dir']}\"."
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function

+import paddle
 import datetime
 from ppcls.utils import logger
 from ppcls.utils.misc import AverageMeter
@@ -54,13 +55,16 @@ def log_info(trainer, batch_size, epoch_id, iter_id):
     ips_msg = "ips: {:.5f} samples/s".format(
         batch_size / trainer.time_info["batch_cost"].avg)

+    global_epochs = trainer.config["Global"]["epochs"]
     eta_sec = (
         (trainer.config["Global"]["epochs"] - epoch_id + 1) *
         trainer.iter_per_epoch - iter_id) * trainer.time_info["batch_cost"].avg
     eta_msg = "eta: {:s}".format(str(datetime.timedelta(seconds=int(eta_sec))))
-    logger.info("[Train][Epoch {}/{}][Iter: {}/{}]{}, {}, {}, {}, {}".format(
-        epoch_id, trainer.config["Global"]["epochs"], iter_id, trainer.
-        iter_per_epoch, lr_msg, metric_msg, time_msg, ips_msg, eta_msg))
+    max_mem_reserved_msg = f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved()}"
+    max_mem_allocated_msg = f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated()}"
+    logger.info(
+        f"[Train][Epoch {epoch_id}/{global_epochs}][Iter: {iter_id}/{trainer.iter_per_epoch}]{lr_msg}, {metric_msg}, {time_msg}, {ips_msg}, {eta_msg}, {max_mem_reserved_msg}, {max_mem_allocated_msg}"
+    )

     for i, lr in enumerate(trainer.lr_sch):
         logger.scaler(
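The two new fields come from Paddle's CUDA memory statistics, which return the peak number of bytes reserved/allocated on the current device so far. A minimal sketch of reading and formatting these counters, assuming a CUDA-enabled Paddle build; the helper name and the MiB conversion are illustrative additions, not part of the commit:

import paddle

def peak_mem_msg():
    # The CUDA memory counters are only meaningful on a CUDA build.
    if not paddle.device.is_compiled_with_cuda():
        return "max_mem_reserved: n/a, max_mem_allocated: n/a"
    # Both counters return bytes; convert to MiB for readability.
    reserved = paddle.device.cuda.max_memory_reserved() / (1024**2)
    allocated = paddle.device.cuda.max_memory_allocated() / (1024**2)
    return (f"max_mem_reserved: {reserved:.0f} MiB, "
            f"max_mem_allocated: {allocated:.0f} MiB")

print(peak_mem_msg())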
@@ -419,10 +419,12 @@ def run(dataloader,
         else:
             epoch_str = "epoch:{:<3d}".format(epoch)
         step_str = "{:s} step:{:<4d}".format(mode, idx)
-
+        max_mem_reserved_str = f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved()}"
+        max_mem_allocated_str = f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated()}"
         if idx % config.get('print_interval', 10) == 0:
-            logger.info("{:s} {:s} {:s}".format(epoch_str, step_str,
-                                                fetchs_str))
+            logger.info(
+                f"{epoch_str} {step_str} {fetchs_str} {max_mem_reserved_str} {max_mem_allocated_str}"
+            )

         tic = time.time()

@@ -83,7 +83,8 @@ def main(args):

     log_file = os.path.join(global_config['output_dir'],
                             config["Arch"]["name"], f"{mode}.log")
-    init_logger(log_file=log_file)
+    log_ranks = config["Global"].get("log_ranks", "0")
+    init_logger(log_file=log_file, log_ranks=log_ranks)
     print_config(config)

     if global_config.get("is_distributed", True):
@@ -22,7 +22,25 @@ import paddle.distributed as dist
 _logger = None


-def init_logger(name='ppcls', log_file=None, log_level=logging.INFO):
+class LoggerHook(object):
+    """
+    logs will print multi-times when calling Fleet API.
+    Commonly, only need to display single log at rank0 and ignore the others.
+    """
+    block = False
+
+    def __init__(self, log):
+        self.log = log
+
+    def __call__(self, *args, **kwargs):
+        if not self.block:
+            self.log(*args, **kwargs)
+
+
+def init_logger(name='ppcls',
+                log_file=None,
+                log_level=logging.INFO,
+                log_ranks="0"):
     """Initialize and get a logger by name.
     If the logger has not been initialized, this method will initialize the
     logger by adding one or two handlers, otherwise the initialized logger will
@@ -35,6 +53,7 @@ def init_logger(name='ppcls', log_file=None, log_level=logging.INFO):
         log_level (int): The logger level. Note that only the process of
             rank 0 is affected, and other processes will set the level to
             "Error" thus be silent most of the time.
+        log_ranks (str): The ids of gpu to log which are separated by "," when more than 1, "0" by default.
     Returns:
         logging.Logger: The expected logger.
     """
@@ -78,42 +97,35 @@ def init_logger(name='ppcls', log_file=None, log_level=logging.INFO):
         if i == len(_logger.handlers) - 1:
             _logger.addHandler(file_handler)

-    if dist.get_rank() == 0:
+    if isinstance(log_ranks, str):
+        log_ranks = [int(i) for i in log_ranks.split(',')]
+    elif isinstance(log_ranks, int):
+        log_ranks = [log_ranks]
+    if dist.get_rank() in log_ranks:
         _logger.setLevel(log_level)
+        LoggerHook.block = False
     else:
         _logger.setLevel(logging.ERROR)
+        LoggerHook.block = True
     _logger.propagate = False


-def log_at_trainer0(log):
-    """
-    logs will print multi-times when calling Fleet API.
-    Only display single log and ignore the others.
-    """
-
-    def wrapper(fmt, *args):
-        if dist.get_rank() == 0:
-            log(fmt, *args)
-
-    return wrapper
-
-
-@log_at_trainer0
+@LoggerHook
 def info(fmt, *args):
     _logger.info(fmt, *args)


-@log_at_trainer0
+@LoggerHook
 def debug(fmt, *args):
     _logger.debug(fmt, *args)


-@log_at_trainer0
+@LoggerHook
 def warning(fmt, *args):
     _logger.warning(fmt, *args)


-@log_at_trainer0
+@LoggerHook
 def error(fmt, *args):
     _logger.error(fmt, *args)

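The LoggerHook class that replaces log_at_trainer0 is a class used as a decorator: every wrapped logging function shares the single class attribute block, which init_logger sets per process depending on whether the rank appears in log_ranks. A minimal standalone sketch of the same pattern, independent of Paddle and with illustrative names:

# Illustrative sketch of the class-as-decorator gating pattern.
class Gate(object):
    block = False  # one flag shared by every decorated function

    def __init__(self, fn):
        self.fn = fn

    def __call__(self, *args, **kwargs):
        if not self.block:
            self.fn(*args, **kwargs)

@Gate
def info(msg):
    print(f"INFO: {msg}")

info("printed")    # block is False, so this prints
Gate.block = True  # what init_logger does on ranks not listed in log_ranks
info("silenced")   # suppressed without touching the wrapped function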