# Copyright (c) Alibaba, Inc. and its affiliates.
import argparse

import numpy as np
import torch
import tqdm
from mmcv.parallel import scatter_kwargs
from torch.backends import cudnn

from easycv.datasets.builder import build_dataset
from easycv.datasets.loader import build_dataloader
from easycv.models.builder import build_model
from easycv.utils.config_tools import mmcv_config_fromfile
from easycv.utils.mmlab_utils import dynamic_adapt_for_mmlab


def parse_args():
    """Parse the command-line options of the benchmark script.

    Returns:
        argparse.Namespace: holds ``config`` (positional path of the test
        config file), ``repeat_num`` (int, default 300), ``warmup_num``
        (int, default 100) and ``gpu`` (str, one of '0'..'7', default '0').
    """
    arg_parser = argparse.ArgumentParser(
        description='EasyCV model memory and inference_time test')
    arg_parser.add_argument('config', help='test config file path')

    # The two iteration counters are plain integer options of the same shape.
    int_options = (
        ('--repeat_num', 300, 'repeat number'),
        ('--warmup_num', 100, 'warm up number'),
    )
    for flag, default_value, help_text in int_options:
        arg_parser.add_argument(
            flag, default=default_value, type=int, help=help_text)

    # The GPU index stays a string and is restricted to the ids '0' to '7'.
    arg_parser.add_argument(
        '--gpu',
        default='0',
        type=str,
        choices=[str(gpu_id) for gpu_id in range(8)])

    return arg_parser.parse_args()


def _cycle_batches(data_loader):
    """Yield batches from ``data_loader`` endlessly, restarting it when exhausted.

    Stops as soon as a complete pass over the loader yields nothing, so an
    empty dataset cannot turn into an infinite loop.
    """
    while True:
        produced = False
        for batch in data_loader:
            produced = True
            yield batch
        if not produced:
            return


def main():
    """Measure the average inference time and CUDA memory usage of a model.

    Builds the model and the validation dataset described by the config file
    given on the command line, runs ``--warmup_num`` untimed forward passes
    followed by ``--repeat_num`` timed ones (batch size 1, ``mode='test'``)
    on the GPU selected with ``--gpu``, and prints the CUDA memory summary
    together with the mean time per timed forward pass in milliseconds.

    The data loader is restarted when it runs out of samples, so the
    requested number of timed iterations is reached even on a small
    validation set.
    """
    cudnn.benchmark = True

    args = parse_args()

    cfg = mmcv_config_fromfile(args.config)

    # dynamic adapt mmdet models
    dynamic_adapt_for_mmlab(cfg)

    device = torch.device('cuda:{}'.format(args.gpu))
    model = build_model(cfg.model).to(device)
    model.eval()

    cfg.data.val.pop('imgs_per_gpu', None)  # pop useless params
    dataset = build_dataset(cfg.data.val)
    data_loader = build_dataloader(
        dataset,
        imgs_per_gpu=1,
        workers_per_gpu=0,
    )

    # CUDA events measure elapsed time on the GPU stream itself, which is the
    # interface PyTorch recommends for timing GPU work.
    starter = torch.cuda.Event(enable_timing=True)
    ender = torch.cuda.Event(enable_timing=True)
    # Only timed iterations are stored, so warm-up passes can never dilute
    # the average.
    timings = []

    warmup_num = max(args.warmup_num, 0)
    total_iters = warmup_num + max(args.repeat_num, 0)

    # Events are recorded on the current stream of the *current* device, so
    # the selected GPU must be current while timing; otherwise the events
    # would land on cuda:0 regardless of --gpu.
    with torch.cuda.device(device), torch.no_grad():
        for idx, data in zip(
                tqdm.trange(total_iters), _cycle_batches(data_loader)):
            _, kwargs = scatter_kwargs(None, data, [int(args.gpu)])
            inputs = kwargs[0]
            inputs.update(dict(mode='test'))
            # GPU may be hibernated to save energy at ordinary times, so it
            # needs to be preheated before any measurement is taken.
            if idx < warmup_num:
                if idx == 0:
                    print('Start warm up ...')
                _ = model(**inputs)
                continue

            if idx == warmup_num:
                print('Warm up end, start to record time...')

            starter.record()
            _ = model(**inputs)
            ender.record()
            # Wait for the GPU work to finish before reading the events.
            torch.cuda.synchronize(device)
            # Time between starter and ender, in milliseconds.
            timings.append(starter.elapsed_time(ender))

    print('Cuda memory: {}'.format(torch.cuda.memory_summary(device)))
    if not timings:
        # Happens with repeat_num <= 0 or an empty dataset; report it instead
        # of dividing by zero or printing a misleading 0 ms.
        print('\nno timed iterations were run, '
              'inference average time is unavailable\n')
        return

    avg = np.asarray(timings).mean()
    print('\ninference average time={}ms\n'.format(avg))


# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()