# Copyright (c) OpenMMLab. All rights reserved.
import random

import torch.nn as nn

from mmpretrain.registry import MODELS
from .modules import FlamingoLayer, GatedCrossAttentionBlock
from .utils import getattr_recursive, setattr_recursive

@MODELS.register_module()
class FlamingoLMAdapter:
    """Mixin to add cross-attention layers to a language model."""

    @classmethod
    def extend_init(
        cls,
        base: object,
        vis_hidden_size: int,
        cross_attn_every_n_layers: int,
        use_media_placement_augmentation: bool,
    ):
        """Initialize Flamingo by adding a new gated cross attn to the decoder.

        Store the media token id for computing the media locations.

        Args:
            base (object): Base module, which could be any object that
                represents an instance of a language model.
            vis_hidden_size (int): Hidden size of vision embeddings.
            cross_attn_every_n_layers (int): Add an additional cross attn
                block after every n decoder layers.
            use_media_placement_augmentation (bool): Whether to use media
                placement augmentation.
        """
        base.set_decoder_layers_attr_name('model.layers')
        gated_cross_attn_layers = nn.ModuleList([
            GatedCrossAttentionBlock(
                dim=base.config.hidden_size, dim_visual=vis_hidden_size) if
            (layer_idx + 1) % cross_attn_every_n_layers == 0 else None
            for layer_idx, _ in enumerate(base._get_decoder_layers())
        ])
        base._set_decoder_layers(
            nn.ModuleList([
                FlamingoLayer(gated_cross_attn_layer, decoder_layer)
                for gated_cross_attn_layer, decoder_layer in zip(
                    gated_cross_attn_layers, base._get_decoder_layers())
            ]))
        base.use_media_placement_augmentation = use_media_placement_augmentation  # noqa
        base.initialized_flamingo = True
        return base
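
    # Note (illustrative, not from the original source): with
    # cross_attn_every_n_layers=4 and a 32-layer decoder, the ModuleList
    # above holds a GatedCrossAttentionBlock at 1-based positions
    # 4, 8, ..., 32 and None everywhere else; FlamingoLayer (from .modules)
    # is expected to fall through to the plain decoder layer whenever its
    # cross-attention slot is None.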

    def set_decoder_layers_attr_name(self, decoder_layers_attr_name):
        """Set decoder layers attribute name."""
        self.decoder_layers_attr_name = decoder_layers_attr_name

    def _get_decoder_layers(self):
        """Get decoder layers according to attribute name."""
        return getattr_recursive(self, self.decoder_layers_attr_name)

    def _set_decoder_layers(self, value):
        """Set decoder layers according to attribute name."""
        setattr_recursive(self, self.decoder_layers_attr_name, value)
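
    # Assumed helper semantics (see .utils for the actual implementation):
    # getattr_recursive walks a dotted path, so
    # getattr_recursive(self, 'model.layers') behaves like
    # self.model.layers, and setattr_recursive replaces the attribute at
    # the end of the same dotted path.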

    def forward(self, *input, **kwargs):
        """Condition the Flamingo layers on the media locations before
        running the forward function."""
        input_ids = kwargs['input_ids'] if 'input_ids' in kwargs else input[0]
        media_locations = input_ids == self.media_token_id
        attend_previous = ((random.random() < 0.5)
                           if self.use_media_placement_augmentation else False)

        for layer in self.get_decoder().layers:
            layer.condition_media_locations(media_locations)
            layer.condition_attend_previous(attend_previous)

        return super().forward(
            *input, **kwargs)  # Call the other parent's forward method
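
    # Note (added): this class is a mixin with no language-model base of
    # its own, so super().forward(...) above only resolves once the adapter
    # has been woven into a concrete model's class, e.g. via
    # type(name, (FlamingoLMAdapter, lang_model.__class__), {}); see the
    # sketch at the end of this file. That composing code is also expected
    # to set self.media_token_id, which is read above but never defined
    # here.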

    def is_conditioned(self) -> bool:
        """Check whether all decoder layers are already conditioned."""
        return all(layer.is_conditioned()
                   for layer in self._get_decoder_layers())

    def clear_conditioned_layers(self):
        """Clear all conditioned layers."""
        for layer in self._get_decoder_layers():
            layer.condition_vis_x(None)
            layer.condition_media_locations(None)
            layer.condition_attend_previous(None)
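

# Example (an illustrative sketch, not part of the original module; the
# checkpoint path, hidden sizes and `media_token_id` below are assumptions):
#
#   from transformers import AutoModelForCausalLM
#
#   lm = AutoModelForCausalLM.from_pretrained('path/to/llm')
#   # Weave the adapter into the model's class so the adapter's forward()
#   # shadows the model's own, while super() still reaches it via the MRO.
#   lm.__class__ = type('FlamingoLM',
#                       (FlamingoLMAdapter, lm.__class__), {})
#   FlamingoLMAdapter.extend_init(
#       lm,
#       vis_hidden_size=1024,
#       cross_attn_every_n_layers=4,
#       use_media_placement_augmentation=False)
#   lm.media_token_id = media_token_id  # id of the <image> placeholder token
#
#   # Typical lifecycle: condition layers on vision features, run the
#   # forward pass, then reset the state between batches.
#   for layer in lm._get_decoder_layers():
#       layer.condition_vis_x(vis_features)
#   outputs = lm(input_ids=input_ids)
#   lm.clear_conditioned_layers()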