mmclassification/mmpretrain/datasets/coco_vqa.py

# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import re
from collections import Counter
from typing import List

import mmengine
from mmengine.dataset import BaseDataset

from mmpretrain.registry import DATASETS


@DATASETS.register_module()
class COCOVQA(BaseDataset):
    """VQAv2 dataset.

    Every sample is built from one entry of the question file, extended
    with the path of the matching image found in ``data_prefix`` and, when
    ``ann_file`` is given, with the annotation stored at the same position
    of the annotation file.

    Args:
        data_root (str): The root directory for ``data_prefix``, ``ann_file``
            and ``question_file``.
        data_prefix (str): The directory of images.
        question_file (str): Question file path.
        ann_file (str, optional): Annotation file path for training and
            validation. Defaults to an empty string.
        **kwargs: Other keyword arguments in :class:`BaseDataset`.
    """

    def __init__(self,
                 data_root: str,
                 data_prefix: str,
                 question_file: str,
                 ann_file: str = '',
                 **kwargs):
        # Stored before the base constructor runs because ``_join_prefix``
        # and ``load_data_list`` read it.
        # NOTE(review): presumably ``BaseDataset.__init__`` is what invokes
        # those hooks -- confirm against the mmengine version in use.
        self.question_file = question_file
        super().__init__(
            data_root=data_root,
            data_prefix=dict(img_path=data_prefix),
            ann_file=ann_file,
            **kwargs,
        )

    def _join_prefix(self):
        """Resolve ``question_file`` against ``data_root``.

        A non-empty, relative ``question_file`` is joined with
        ``data_root``; the remaining paths are left to the base class.

        Returns:
            Whatever ``BaseDataset._join_prefix`` returns.
        """
        # Check for emptiness first so ``is_abs`` only sees real paths.
        if self.question_file and not mmengine.is_abs(self.question_file):
            self.question_file = osp.join(self.data_root, self.question_file)

        return super()._join_prefix()

    def _create_image_index(self):
        """Build a lookup table from image id to image file path.

        The image directory (``data_prefix['img_path']``) is listed and
        every file whose name contains a run of 12 digits is indexed under
        the integer value of the last such run. Other files are skipped.

        Returns:
            dict: Mapping ``image_id (int) -> image path``.
        """
        img_prefix = self.data_prefix['img_path']
        id_pattern = re.compile(r'\d{12}')

        image_index = {}
        for file in mmengine.list_dir_or_file(img_prefix, list_dir=False):
            id_candidates = id_pattern.findall(file)
            if id_candidates:
                image_id = int(id_candidates[-1])
                image_index[image_id] = mmengine.join_path(img_prefix, file)

        return image_index

    def load_data_list(self) -> List[dict]:
        """Load data list.

        Returns:
            List[dict]: One dict per question. It holds the fields of the
            question entry plus ``img_path``. When an annotation file is
            used it also holds ``gt_answer`` (the distinct answers),
            ``gt_answer_weight`` (for each distinct answer, its share among
            all answer entries) and every remaining annotation field.

        Raises:
            AssertionError: If the question and annotation files differ in
                length, or if a question and the annotation at the same
                position have different ``question_id``.
            KeyError: If no image file was indexed for a question's
                ``image_id``.
        """
        questions = mmengine.load(self.question_file)['questions']
        if self.ann_file:
            annotations = mmengine.load(self.ann_file)['annotations']
            # Raised explicitly instead of via ``assert`` so the check is
            # not stripped under ``python -O``; the exception type is kept
            # for backward compatibility.
            if len(questions) != len(annotations):
                raise AssertionError(
                    f'The question file has {len(questions)} entries but '
                    f'the annotation file has {len(annotations)}.')
        else:
            annotations = [None] * len(questions)

        # The original VQAv2 annotation file and question file includes
        # only image id but no image file paths.
        self.image_index = self._create_image_index()

        data_list = []
        for question, ann in zip(questions, annotations):
            # question example
            # {
            #     'image_id': 262144,
            #     'question': "Is the ball flying towards the batter?",
            #     'question_id': 262144000
            # }
            #
            # ann example
            # {
            #     'question_type': "what are the",
            #     'answer_type': "other",
            #     'answers': [
            #         {'answer': 'watching',
            #          'answer_id': 1,
            #          'answer_confidence': 'yes'},
            #         ...
            #     ],
            #     'image_id': 262148,
            #     'question_id': 262148000,
            #     'multiple_choice_answer': 'watching',
            # }

            # The question dict itself is reused as the sample dict.
            data_info = question
            image_id = question['image_id']
            try:
                data_info['img_path'] = self.image_index[image_id]
            except KeyError:
                img_dir = self.data_prefix['img_path']
                raise KeyError(
                    f'No image file found for image_id {image_id} '
                    f'(question_id {question.get("question_id")}) '
                    f'in {img_dir}.') from None

            if ann is not None:
                if ann['question_id'] != question['question_id']:
                    raise AssertionError(
                        'Question and annotation files are not aligned: '
                        f'question_id {question["question_id"]} is paired '
                        f'with annotation {ann["question_id"]}.')

                # Collapse duplicate answers and weight every distinct
                # answer by its share among all answer entries.
                answers = [item['answer'] for item in ann.pop('answers')]
                count = Counter(answers)
                num_answers = len(answers)
                data_info['gt_answer'] = list(count.keys())
                data_info['gt_answer_weight'] = [
                    num / num_answers for num in count.values()
                ]
                # Copy the remaining annotation fields ('answers' was
                # popped above) into the sample.
                data_info.update(ann)

            data_list.append(data_info)

        return data_list
[Feature] Support multiple multi-modal algorithms and inferencers. (#1561) * [Feat] Migrate blip caption to mmpretrain. (#50) * Migrate blip caption to mmpretrain * minor fix * support train * [Feature] Support OFA caption task. (#51) * [Feature] Support OFA caption task. * Remove duplicated files. * [Feature] Support OFA vqa task. (#58) * [Feature] Support OFA vqa task. * Fix lint. * [Feat] Add BLIP retrieval to mmpretrain. (#55) * init * minor fix for train * fix according to comments * refactor * Update Blip retrieval. (#62) * [Feature] Support OFA visual grounding task. (#59) * [Feature] Support OFA visual grounding task. * minor add TODO --------- Co-authored-by: yingfhu <yingfhu@gmail.com> * [Feat] Add flamingos coco caption and vqa. (#60) * first init * init flamingo coco * add vqa * minor fix * remove unnecessary modules * Update config * Use `ApplyToList`. --------- Co-authored-by: mzr1996 <mzr1996@163.com> * [Feature]: BLIP2 coco retrieval (#53) * [Feature]: Add blip2 retriever * [Feature]: Add blip2 all modules * [Feature]: Refine model * [Feature]: x1 * [Feature]: Runnable coco ret * [Feature]: Runnable version * [Feature]: Fix lint * [Fix]: Fix lint * [Feature]: Use 364 img size * [Feature]: Refactor blip2 * [Fix]: Fix lint * refactor files * minor fix * minor fix --------- Co-authored-by: yingfhu <yingfhu@gmail.com> * Remove * fix blip caption inputs (#68) * [Feat] Add BLIP NLVR support. 
(#67) * first init * init flamingo coco * add vqa * add nlvr * refactor nlvr * minor fix * minor fix * Update dataset --------- Co-authored-by: mzr1996 <mzr1996@163.com> * [Feature]: BLIP2 Caption (#70) * [Feature]: Add language model * [Feature]: blip2 caption forward * [Feature]: Reproduce the results * [Feature]: Refactor caption * refine config --------- Co-authored-by: yingfhu <yingfhu@gmail.com> * [Feat] Migrate BLIP VQA to mmpretrain (#69) * reformat * change * change * change * change * change * change * change * change * change * change * change * change * change * change * change * change * change * change * change * refactor code --------- Co-authored-by: yingfhu <yingfhu@gmail.com> * Update RefCOCO dataset * [Fix] fix lint * [Feature] Implement inference APIs for multi-modal tasks. (#65) * [Feature] Implement inference APIs for multi-modal tasks. * [Project] Add gradio demo. * [Improve] Update requirements * Update flamingo * Update blip * Add NLVR inferencer * Update flamingo * Update hugging face model register * Update ofa vqa * Update BLIP-vqa (#71) * Update blip-vqa docstring (#72) * Refine flamingo docstring (#73) * [Feature]: BLIP2 VQA (#61) * [Feature]: VQA forward * [Feature]: Reproduce accuracy * [Fix]: Fix lint * [Fix]: Add blank line * minor fix --------- Co-authored-by: yingfhu <yingfhu@gmail.com> * [Feature]: BLIP2 docstring (#74) * [Feature]: Add caption docstring * [Feature]: Add docstring to blip2 vqa * [Feature]: Add docstring to retrieval * Update BLIP-2 metafile and README (#75) * [Feature]: Add readme and docstring * Update blip2 results --------- Co-authored-by: mzr1996 <mzr1996@163.com> * [Feature] BLIP Visual Grounding on MMPretrain Branch (#66) * blip grounding merge with mmpretrain * remove commit * blip grounding test and inference api * refcoco dataset * refcoco dataset refine config * rebasing * gitignore * rebasing * minor edit * minor edit * Update blip-vqa docstring (#72) * rebasing * Revert "minor edit" This reverts 
commit 639cec757c215e654625ed0979319e60f0be9044. * blip grounding final * precommit * refine config * refine config * Update blip visual grounding --------- Co-authored-by: Yiqin Wang 王逸钦 <wyq1217@outlook.com> Co-authored-by: mzr1996 <mzr1996@163.com> * Update visual grounding metric * Update OFA docstring, README and metafiles. (#76) * [Docs] Update installation docs and gradio demo docs. (#77) * Update OFA name * Update Visual Grounding Visualizer * Integrate accelerate support * Fix imports. * Fix timm backbone * Update imports * Update README * Update circle ci * Update flamingo config * Add gradio demo README * [Feature]: Add scienceqa (#1571) * [Feature]: Add scienceqa * [Feature]: Change param name * Update docs * Update video --------- Co-authored-by: Hubert <42952108+yingfhu@users.noreply.github.com> Co-authored-by: yingfhu <yingfhu@gmail.com> Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> Co-authored-by: Yiqin Wang 王逸钦 <wyq1217@outlook.com> Co-authored-by: Rongjie Li <limo97@163.com> 2023-05-19 16:50:04 +08:00			`# Copyright (c) OpenMMLab. All rights reserved.`
			`import os.path as osp`
			`import re`
			`from collections import Counter`
			`from typing import List`

			`import mmengine`
			`from mmengine.dataset import BaseDataset`

			`from mmpretrain.registry import DATASETS`


			@DATASETS.register_module()
			class COCOVQA(BaseDataset):
			    """VQAv2 dataset.

			    Every sample is built from one entry of the question file, extended
			    with the path of the matching image found in ``data_prefix`` and, when
			    ``ann_file`` is given, with the annotation stored at the same position
			    of the annotation file.

			    Args:
			        data_root (str): The root directory for ``data_prefix``, ``ann_file``
			            and ``question_file``.
			        data_prefix (str): The directory of images.
			        question_file (str): Question file path.
			        ann_file (str, optional): Annotation file path for training and
			            validation. Defaults to an empty string.
			        **kwargs: Other keyword arguments in :class:`BaseDataset`.
			    """

			    def __init__(self,
			                 data_root: str,
			                 data_prefix: str,
			                 question_file: str,
			                 ann_file: str = '',
			                 **kwargs):
			        # Stored before the base constructor runs because ``_join_prefix``
			        # and ``load_data_list`` read it.
			        # NOTE(review): presumably ``BaseDataset.__init__`` is what invokes
			        # those hooks -- confirm against the mmengine version in use.
			        self.question_file = question_file
			        super().__init__(
			            data_root=data_root,
			            data_prefix=dict(img_path=data_prefix),
			            ann_file=ann_file,
			            **kwargs,
			        )

			    def _join_prefix(self):
			        """Resolve ``question_file`` against ``data_root``.

			        A non-empty, relative ``question_file`` is joined with
			        ``data_root``; the remaining paths are left to the base class.

			        Returns:
			            Whatever ``BaseDataset._join_prefix`` returns.
			        """
			        # Check for emptiness first so ``is_abs`` only sees real paths.
			        if self.question_file and not mmengine.is_abs(self.question_file):
			            self.question_file = osp.join(self.data_root, self.question_file)

			        return super()._join_prefix()

			    def _create_image_index(self):
			        """Build a lookup table from image id to image file path.

			        The image directory (``data_prefix['img_path']``) is listed and
			        every file whose name contains a run of 12 digits is indexed under
			        the integer value of the last such run. Other files are skipped.

			        Returns:
			            dict: Mapping ``image_id (int) -> image path``.
			        """
			        img_prefix = self.data_prefix['img_path']
			        id_pattern = re.compile(r'\d{12}')

			        image_index = {}
			        for file in mmengine.list_dir_or_file(img_prefix, list_dir=False):
			            id_candidates = id_pattern.findall(file)
			            if id_candidates:
			                image_id = int(id_candidates[-1])
			                image_index[image_id] = mmengine.join_path(img_prefix, file)

			        return image_index

			    def load_data_list(self) -> List[dict]:
			        """Load data list.

			        Returns:
			            List[dict]: One dict per question. It holds the fields of the
			            question entry plus ``img_path``. When an annotation file is
			            used it also holds ``gt_answer`` (the distinct answers),
			            ``gt_answer_weight`` (for each distinct answer, its share among
			            all answer entries) and every remaining annotation field.

			        Raises:
			            AssertionError: If the question and annotation files differ in
			                length, or if a question and the annotation at the same
			                position have different ``question_id``.
			            KeyError: If no image file was indexed for a question's
			                ``image_id``.
			        """
			        questions = mmengine.load(self.question_file)['questions']
			        if self.ann_file:
			            annotations = mmengine.load(self.ann_file)['annotations']
			            # Raised explicitly instead of via ``assert`` so the check is
			            # not stripped under ``python -O``; the exception type is kept
			            # for backward compatibility.
			            if len(questions) != len(annotations):
			                raise AssertionError(
			                    f'The question file has {len(questions)} entries but '
			                    f'the annotation file has {len(annotations)}.')
			        else:
			            annotations = [None] * len(questions)

			        # The original VQAv2 annotation file and question file includes
			        # only image id but no image file paths.
			        self.image_index = self._create_image_index()

			        data_list = []
			        for question, ann in zip(questions, annotations):
			            # question example
			            # {
			            #     'image_id': 262144,
			            #     'question': "Is the ball flying towards the batter?",
			            #     'question_id': 262144000
			            # }
			            #
			            # ann example
			            # {
			            #     'question_type': "what are the",
			            #     'answer_type': "other",
			            #     'answers': [
			            #         {'answer': 'watching',
			            #          'answer_id': 1,
			            #          'answer_confidence': 'yes'},
			            #         ...
			            #     ],
			            #     'image_id': 262148,
			            #     'question_id': 262148000,
			            #     'multiple_choice_answer': 'watching',
			            # }

			            # The question dict itself is reused as the sample dict.
			            data_info = question
			            image_id = question['image_id']
			            try:
			                data_info['img_path'] = self.image_index[image_id]
			            except KeyError:
			                img_dir = self.data_prefix['img_path']
			                raise KeyError(
			                    f'No image file found for image_id {image_id} '
			                    f'(question_id {question.get("question_id")}) '
			                    f'in {img_dir}.') from None

			            if ann is not None:
			                if ann['question_id'] != question['question_id']:
			                    raise AssertionError(
			                        'Question and annotation files are not aligned: '
			                        f'question_id {question["question_id"]} is paired '
			                        f'with annotation {ann["question_id"]}.')

			                # Collapse duplicate answers and weight every distinct
			                # answer by its share among all answer entries.
			                answers = [item['answer'] for item in ann.pop('answers')]
			                count = Counter(answers)
			                num_answers = len(answers)
			                data_info['gt_answer'] = list(count.keys())
			                data_info['gt_answer_weight'] = [
			                    num / num_answers for num in count.values()
			                ]
			                # Copy the remaining annotation fields ('answers' was
			                # popped above) into the sample.
			                data_info.update(ann)

			            data_list.append(data_info)

			        return data_list