# mmselfsup/docs/zh_cn/stat.py

#!/usr/bin/env python
import re
from collections import defaultdict
from pathlib import Path

from modelindex.load_model_index import load

# Third ancestor of this file's absolute path (index 2 of ``parents``), i.e.
# two directory levels above the directory that contains this script.
MMSELFSUP_ROOT = Path(__file__).absolute().parents[2]
# Relative path, so it is resolved against the current working directory.
PAPERS_ROOT = Path('papers')  # Path to save generated paper pages.
# Prefix prepended to repository-relative paths when README links are
# rewritten in ``generate_paper_page``.
GITHUB_PREFIX = 'https://github.com/open-mmlab/mmselfsup/blob/1.x/'
# Markdown skeleton of the statistics page. ``count_papers`` fills in the
# ``num_papers``, ``type_msg``, ``num_ckpts`` and ``paper_msg`` placeholders.
# The heading is Chinese for "Model zoo statistics summary".
MODELZOO_TEMPLATE = """
# 模型库数据汇总

* Number of papers: {num_papers}
{type_msg}

* Number of checkpoints: {num_ckpts}
{paper_msg}
"""

# Loaded once at import time from ``model-index.yml`` under MMSELFSUP_ROOT.
# The functions below read its ``collections`` and ``models`` attributes.
model_index = load(str(MMSELFSUP_ROOT / 'model-index.yml'))


def build_collections(model_index):
    """Cross-link every model with the collection it belongs to.

    After this call each collection carries a ``models`` list holding its
    models (in index order), and each model carries a ``collection``
    attribute pointing back to its owning collection. The objects are
    modified in place and nothing is returned.

    Args:
        model_index: Object exposing iterable ``collections`` (each with a
            ``name``) and ``models`` (each with an ``in_collection`` name).

    Raises:
        KeyError: If a model names a collection that is not in
            ``model_index.collections``.
    """
    lookup = {}
    for collection in model_index.collections:
        # Start every collection with an empty list, discarding any
        # previous ``models`` attribute.
        collection.models = []
        lookup[collection.name] = collection

    for entry in model_index.models:
        owner = lookup[entry.in_collection]
        owner.models.append(entry)
        entry.collection = owner


# Link models and collections once at import time; ``count_papers`` below
# reads the ``model.collection`` attribute this call sets.
build_collections(model_index)


def count_papers(model_index):
    """Write ``model_zoo_statistics.md`` summarising papers and checkpoints.

    For every collection the function counts its checkpoints: one for each
    model that has ``weights`` plus one for each entry of the model's
    ``Downstream`` metadata that has ``Weights``. It also tallies the
    collections by their ``type`` metadata (``'Algorithm'`` when absent) and
    renders the result through ``MODELZOO_TEMPLATE`` into
    ``model_zoo_statistics.md`` in the current working directory.

    Args:
        model_index: Object exposing ``models`` and ``collections``. Every
            model must already have a ``collection`` attribute, as set by
            ``build_collections``.

    Raises:
        KeyError: If a collection's ``paper`` mapping has no ``'Title'``.
    """
    # defaultdict(int) so that a model without weights is never counted and
    # a collection without any model is reported with 0 checkpoints instead
    # of raising KeyError.
    ckpt_dict = defaultdict(int)
    type_count = defaultdict(int)
    paper_msgs = []

    for model in model_index.models:
        col_name = model.collection.name
        if model.weights:
            ckpt_dict[col_name] += 1

        # Downstream-task checkpoints are counted towards the same paper.
        for downstream_task in model.data.get('Downstream', []):
            if downstream_task.get('Weights'):
                ckpt_dict[col_name] += 1

    for collection in model_index.collections:
        name = collection.name
        title = collection.paper['Title']
        papertype = collection.data.get('type', 'Algorithm')
        type_count[papertype] += 1

        # Link to the page written by ``generate_paper_page``:
        # ``<PAPERS_ROOT>/<metafile folder name>.md``. ``as_posix`` keeps
        # forward slashes in the Markdown link on every platform.
        page_name = Path(collection.filepath).parent.with_suffix('.md').name
        page_link = (PAPERS_ROOT / page_name).as_posix()
        paper_msgs.append(
            f'\t- [{papertype}] [{title}]({page_link}) ({ckpt_dict[name]} '
            f'ckpts)')

    type_msg = '\n'.join(
        f'\t- {type_}: {count}' for type_, count in type_count.items())
    paper_msg = '\n'.join(paper_msgs)

    modelzoo = MODELZOO_TEMPLATE.format(
        num_papers=sum(type_count.values()),
        num_ckpts=sum(ckpt_dict.values()),
        type_msg=type_msg,
        paper_msg=paper_msg,
    )

    # The template contains non-ASCII text, so do not rely on the locale's
    # default encoding.
    with open('model_zoo_statistics.md', 'w', encoding='utf-8') as f:
        f.write(modelzoo)


# Generate ``model_zoo_statistics.md`` at import time. This must run after
# ``build_collections`` because it reads ``model.collection``.
count_papers(model_index)


def generate_paper_page(collection):
    """Copy a collection's README into ``PAPERS_ROOT`` with fixed-up links.

    The README at ``MMSELFSUP_ROOT / collection.readme`` is read, every
    Markdown link ``[text](target)`` whose target does not start with
    ``http`` is rewritten to ``GITHUB_PREFIX`` plus the target's path
    relative to ``MMSELFSUP_ROOT``, and the result is written to
    ``PAPERS_ROOT / <metafile folder name>.md``.

    Args:
        collection: Object exposing ``readme`` (README path relative to
            ``MMSELFSUP_ROOT``) and ``filepath`` (path of its metafile).

    Raises:
        AssertionError: If a relative link points to a path that does not
            exist.
        ValueError: If a linked path is not located under
            ``MMSELFSUP_ROOT``.
    """
    PAPERS_ROOT.mkdir(exist_ok=True)

    # Write a copy of README. Read and write as UTF-8 explicitly so the
    # result does not depend on the locale's default encoding.
    with open(MMSELFSUP_ROOT / collection.readme, encoding='utf-8') as f:
        readme = f.read()
    folder = Path(collection.filepath).parent
    copy = PAPERS_ROOT / folder.with_suffix('.md').name

    def replace_link(matchobj):
        # Replace relative link to GitHub link.
        name = matchobj.group(1)
        link = matchobj.group(2)
        if not link.startswith('http'):
            target = folder / link
            # Raised explicitly (rather than via ``assert``) so the check
            # is still enforced when Python runs with ``-O``.
            if not target.exists():
                raise AssertionError(
                    f'Link not found:\n{collection.readme}: {link}')
            rel_link = target.absolute().relative_to(MMSELFSUP_ROOT)
            # URLs always use forward slashes, whatever the platform.
            link = GITHUB_PREFIX + rel_link.as_posix()
        return f'[{name}]({link})'

    content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_link, readme)

    with open(copy, 'w', encoding='utf-8') as copy_file:
        copy_file.write(content)


# Write one page under PAPERS_ROOT for every collection in the model index.
for collection in model_index.collections:
    generate_paper_page(collection)