mmclassification/mmpretrain/visualization/visualizer.py

# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Sequence, Tuple, Union
import mmcv
import numpy as np
import torch
import torch.nn.functional as F
from mmengine.dataset import BaseDataset
from mmengine.dist import master_only
from mmengine.visualization import Visualizer
from mmengine.visualization.utils import img_from_canvas
from mmpretrain.registry import VISUALIZERS
from mmpretrain.structures import DataSample
from .utils import create_figure, get_adaptive_scale
@VISUALIZERS.register_module()
class UniversalVisualizer(Visualizer):
"""Universal Visualizer for multiple tasks.
Args:
name (str): Name of the instance. Defaults to 'visualizer'.
image (np.ndarray, optional): The original image to draw. The format
should be RGB. Defaults to None.
vis_backends (list, optional): Visual backend config list.
Defaults to None.
save_dir (str, optional): Save file dir for all storage backends.
If it is None, the backend storage will not save any data.
fig_save_cfg (dict): Keyword parameters of figure for saving.
Defaults to empty dict.
fig_show_cfg (dict): Keyword parameters of figure for showing.
Defaults to empty dict.
"""
DEFAULT_TEXT_CFG = {
'family': 'monospace',
'color': 'white',
'bbox': dict(facecolor='black', alpha=0.5, boxstyle='Round'),
'verticalalignment': 'top',
'horizontalalignment': 'left',
}
@master_only
def visualize_cls(self,
image: np.ndarray,
data_sample: DataSample,
classes: Optional[Sequence[str]] = None,
draw_gt: bool = True,
draw_pred: bool = True,
draw_score: bool = True,
resize: Optional[int] = None,
rescale_factor: Optional[float] = None,
text_cfg: dict = dict(),
show: bool = False,
wait_time: float = 0,
out_file: Optional[str] = None,
name: str = '',
step: int = 0) -> np.ndarray:
"""Visualize image classification result.
This method will draw a text box on the input image to visualize
information about the image classification result, such as the
ground-truth label and the predicted label.
Args:
image (np.ndarray): The image to draw. The format should be RGB.
data_sample (:obj:`DataSample`): The annotation of the image.
classes (Sequence[str], optional): The category names.
Defaults to None.
draw_gt (bool): Whether to draw ground-truth labels.
Defaults to True.
draw_pred (bool): Whether to draw prediction labels.
Defaults to True.
draw_score (bool): Whether to draw the prediction scores
of the predicted categories. Defaults to True.
resize (int, optional): Resize the short edge of the image to the
specified length before visualization. Defaults to None.
rescale_factor (float, optional): Rescale the image by the rescale
factor before visualization. Defaults to None.
text_cfg (dict): Extra text setting, which accepts
arguments of :meth:`mmengine.Visualizer.draw_texts`.
Defaults to an empty dict.
show (bool): Whether to display the drawn image in a window. Please
confirm you are able to access the graphical interface.
Defaults to False.
wait_time (float): The display time (s). Defaults to 0, which means
"forever".
out_file (str, optional): Extra path to save the visualization
result. If specified, the visualizer will only save the result
image to the out_file and ignore its storage backends.
Defaults to None.
name (str): The image identifier. It's useful when using the
storage backends of the visualizer to save or display the
image. Defaults to an empty string.
step (int): The global step value. It's useful to record a
series of visualization results for the same image with the
storage backends. Defaults to 0.
Returns:
np.ndarray: The visualization image.
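Examples:
A minimal, illustrative sketch with random data; the image, labels and
scores below are placeholders rather than real model outputs::

>>> import numpy as np
>>> import torch
>>> from mmpretrain.structures import DataSample
>>> from mmpretrain.visualization import UniversalVisualizer
>>> vis = UniversalVisualizer()
>>> image = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
>>> data_sample = DataSample()
>>> data_sample.set_gt_label(1)
>>> data_sample.set_pred_label(1)
>>> data_sample.set_pred_score(torch.tensor([0.1, 0.7, 0.2]))
>>> drawn = vis.visualize_cls(
...     image, data_sample, classes=['cat', 'dog', 'bird'])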
"""
if self.dataset_meta is not None:
classes = classes or self.dataset_meta.get('classes', None)
if resize is not None:
h, w = image.shape[:2]
if w < h:
image = mmcv.imresize(image, (resize, resize * h // w))
else:
image = mmcv.imresize(image, (resize * w // h, resize))
elif rescale_factor is not None:
image = mmcv.imrescale(image, rescale_factor)
texts = []
self.set_image(image)
if draw_gt and 'gt_label' in data_sample:
idx = data_sample.gt_label.tolist()
class_labels = [''] * len(idx)
if classes is not None:
class_labels = [f' ({classes[i]})' for i in idx]
labels = [str(idx[i]) + class_labels[i] for i in range(len(idx))]
prefix = 'Ground truth: '
texts.append(prefix + ('\n' + ' ' * len(prefix)).join(labels))
if draw_pred and 'pred_label' in data_sample:
idx = data_sample.pred_label.tolist()
score_labels = [''] * len(idx)
class_labels = [''] * len(idx)
if draw_score and 'pred_score' in data_sample:
score_labels = [
f', {data_sample.pred_score[i].item():.2f}' for i in idx
]
if classes is not None:
class_labels = [f' ({classes[i]})' for i in idx]
labels = [
str(idx[i]) + score_labels[i] + class_labels[i]
for i in range(len(idx))
]
prefix = 'Prediction: '
texts.append(prefix + ('\n' + ' ' * len(prefix)).join(labels))
img_scale = get_adaptive_scale(image.shape[:2])
text_cfg = {
'size': int(img_scale * 7),
**self.DEFAULT_TEXT_CFG,
**text_cfg,
}
self.ax_save.text(
img_scale * 5,
img_scale * 5,
'\n'.join(texts),
**text_cfg,
)
drawn_img = self.get_image()
if show:
self.show(drawn_img, win_name=name, wait_time=wait_time)
if out_file is not None:
# save the image to the target file instead of vis_backends
mmcv.imwrite(drawn_img[..., ::-1], out_file)
else:
self.add_image(name, drawn_img, step=step)
return drawn_img
@master_only
def visualize_image_retrieval(self,
image: np.ndarray,
data_sample: DataSample,
prototype_dataset: BaseDataset,
topk: int = 1,
draw_score: bool = True,
resize: Optional[int] = None,
text_cfg: dict = dict(),
show: bool = False,
wait_time: float = 0,
out_file: Optional[str] = None,
name: Optional[str] = '',
step: int = 0) -> np.ndarray:
"""Visualize image retrieval result.
This method will draw the input image and the images retrieved from the
prototype dataset.
Args:
image (np.ndarray): The image to draw. The format should be RGB.
data_sample (:obj:`DataSample`): The annotation of the image.
prototype_dataset (:obj:`BaseDataset`): The prototype dataset.
It should have a `get_data_info` method that returns a dict
including `img_path`.
topk (int): To visualize the topk matching items. Defaults to 1.
draw_score (bool): Whether to draw the match scores of the
retrieved images. Defaults to True.
resize (int, optional): Resize the long edge of the image to the
specified length before visualization. Defaults to None.
text_cfg (dict): Extra text setting, which accepts arguments of
:func:`plt.text`. Defaults to an empty dict.
show (bool): Whether to display the drawn image in a window. Please
confirm you are able to access the graphical interface.
Defaults to False.
wait_time (float): The display time (s). Defaults to 0, which means
"forever".
out_file (str, optional): Extra path to save the visualization
result. If specified, the visualizer will only save the result
image to the out_file and ignore its storage backends.
Defaults to None.
name (str): The image identifier. It's useful when using the
storage backends of the visualizer to save or display the
image. Defaults to an empty string.
step (int): The global step value. It's useful to record a
series of visualization results for the same image with the
storage backends. Defaults to 0.
Returns:
np.ndarray: The visualization image.
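Examples:
An illustrative sketch only. ``prototype`` is a placeholder for any
dataset whose ``get_data_info(i)`` returns a dict with a readable
``img_path``; it is not an object defined in this file::

>>> import numpy as np
>>> import torch
>>> from mmpretrain.structures import DataSample
>>> from mmpretrain.visualization import UniversalVisualizer
>>> vis = UniversalVisualizer()
>>> query = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
>>> data_sample = DataSample()
>>> data_sample.set_pred_score(torch.rand(len(prototype)))
>>> drawn = vis.visualize_image_retrieval(
...     query, data_sample, prototype, topk=3)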
"""
text_cfg = {**self.DEFAULT_TEXT_CFG, **text_cfg}
if resize is not None:
image = mmcv.imrescale(image, (resize, resize))
match_scores, indices = torch.topk(data_sample.pred_score, k=topk)
figure = create_figure(margin=True)
gs = figure.add_gridspec(2, topk)
query_plot = figure.add_subplot(gs[0, :])
query_plot.axis(False)
query_plot.imshow(image)
for k, (score, sample_idx) in enumerate(zip(match_scores, indices)):
sample = prototype_dataset.get_data_info(sample_idx.item())
value_image = mmcv.imread(sample['img_path'])[..., ::-1]
value_plot = figure.add_subplot(gs[1, k])
value_plot.axis(False)
value_plot.imshow(value_image)
if draw_score:
value_plot.text(
5,
5,
f'{score:.2f}',
**text_cfg,
)
drawn_img = img_from_canvas(figure.canvas)
self.set_image(drawn_img)
if show:
self.show(drawn_img, win_name=name, wait_time=wait_time)
if out_file is not None:
# save the image to the target file instead of vis_backends
mmcv.imwrite(drawn_img[..., ::-1], out_file)
else:
self.add_image(name, drawn_img, step=step)
return drawn_img
def add_mask_to_image(
self,
image: np.ndarray,
data_sample: DataSample,
resize: Union[int, Tuple[int]] = 224,
color: Union[str, Tuple[int]] = 'black',
alpha: Union[int, float] = 0.8,
) -> np.ndarray:
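"""Overlay the binary mask in ``data_sample.mask`` on the image.

The image is resized to ``resize``, the mask is interpolated to the
same size and drawn with the given color and transparency, and the
drawn RGB image is returned. This is the drawing helper used by
:meth:`visualize_masked_image`.
"""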
if isinstance(resize, int):
resize = (resize, resize)
image = mmcv.imresize(image, resize)
self.set_image(image)
if isinstance(data_sample.mask, np.ndarray):
data_sample.mask = torch.tensor(data_sample.mask)
mask = data_sample.mask.float()[None, None, ...]
mask_ = F.interpolate(mask, image.shape[:2], mode='nearest')[0, 0]
self.draw_binary_masks(mask_.bool(), colors=color, alphas=alpha)
drawn_img = self.get_image()
return drawn_img
@master_only
def visualize_masked_image(self,
image: np.ndarray,
data_sample: DataSample,
resize: Union[int, Tuple[int]] = 224,
color: Union[str, Tuple[int]] = 'black',
alpha: Union[int, float] = 0.8,
show: bool = False,
wait_time: float = 0,
out_file: Optional[str] = None,
name: str = '',
step: int = 0) -> np.ndarray:
"""Visualize masked image.
This method will draw an image with binary mask.
Args:
image (np.ndarray): The image to draw. The format should be RGB.
data_sample (:obj:`DataSample`): The annotation of the image.
resize (int | Tuple[int]): Resize the input image to the specified
shape. Defaults to 224.
color (str | Tuple[int]): The color of the binary mask.
Defaults to "black".
alpha (int | float): The transparency of the mask. Defaults to 0.8.
show (bool): Whether to display the drawn image in a window. Please
confirm you are able to access the graphical interface.
Defaults to False.
wait_time (float): The display time (s). Defaults to 0, which means
"forever".
out_file (str, optional): Extra path to save the visualization
result. If specified, the visualizer will only save the result
image to the out_file and ignore its storage backends.
Defaults to None.
name (str): The image identifier. It's useful when using the
storage backends of the visualizer to save or display the
image. Defaults to an empty string.
step (int): The global step value. It's useful to record a
series of visualization results for the same image with the
storage backends. Defaults to 0.
Returns:
np.ndarray: The visualization image.
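Examples:
A minimal, illustrative sketch with a random image and a random
14x14 patch mask (the kind of mask produced by MAE-style masking)::

>>> import numpy as np
>>> import torch
>>> from mmpretrain.structures import DataSample
>>> from mmpretrain.visualization import UniversalVisualizer
>>> vis = UniversalVisualizer()
>>> image = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
>>> data_sample = DataSample()
>>> data_sample.mask = torch.rand(14, 14) > 0.5
>>> drawn = vis.visualize_masked_image(image, data_sample)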
"""
drawn_img = self.add_mask_to_image(
image=image,
data_sample=data_sample,
resize=resize,
color=color,
alpha=alpha)
if show:
self.show(drawn_img, win_name=name, wait_time=wait_time)
if out_file is not None:
# save the image to the target file instead of vis_backends
mmcv.imwrite(drawn_img[..., ::-1], out_file)
else:
self.add_image(name, drawn_img, step=step)
return drawn_img
@master_only
def visualize_image_caption(self,
image: np.ndarray,
data_sample: DataSample,
resize: Optional[int] = None,
text_cfg: dict = dict(),
show: bool = False,
wait_time: float = 0,
out_file: Optional[str] = None,
name: Optional[str] = '',
step: int = 0) -> np.ndarray:
"""Visualize image caption result.
This method will draw the input image and its predicted caption.
Args:
image (np.ndarray): The image to draw. The format should be RGB.
data_sample (:obj:`DataSample`): The annotation of the image.
resize (int, optional): Resize the short edge of the image to the
specified length before visualization. Defaults to None.
text_cfg (dict): Extra text setting, which accepts arguments of
:func:`plt.text`. Defaults to an empty dict.
show (bool): Whether to display the drawn image in a window. Please
confirm you are able to access the graphical interface.
Defaults to False.
wait_time (float): The display time (s). Defaults to 0, which means
"forever".
out_file (str, optional): Extra path to save the visualization
result. If specified, the visualizer will only save the result
image to the out_file and ignore its storage backends.
Defaults to None.
name (str): The image identifier. It's useful when using the
storage backends of the visualizer to save or display the
image. Defaults to an empty string.
step (int): The global step value. It's useful to record a
series of visualization results for the same image with the
storage backends. Defaults to 0.
Returns:
np.ndarray: The visualization image.
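Examples:
An illustrative sketch; the caption is a placeholder string rather
than a real model prediction::

>>> import numpy as np
>>> from mmpretrain.structures import DataSample
>>> from mmpretrain.visualization import UniversalVisualizer
>>> vis = UniversalVisualizer()
>>> image = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
>>> data_sample = DataSample()
>>> data_sample.pred_caption = 'a cat sitting on a wooden bench'
>>> drawn = vis.visualize_image_caption(image, data_sample)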
"""
text_cfg = {**self.DEFAULT_TEXT_CFG, **text_cfg}
if resize is not None:
h, w = image.shape[:2]
if w < h:
image = mmcv.imresize(image, (resize, resize * h // w))
else:
image = mmcv.imresize(image, (resize * w // h, resize))
self.set_image(image)
img_scale = get_adaptive_scale(image.shape[:2])
text_cfg = {
'size': int(img_scale * 7),
**self.DEFAULT_TEXT_CFG,
**text_cfg,
}
self.ax_save.text(
img_scale * 5,
img_scale * 5,
data_sample.get('pred_caption'),
wrap=True,
**text_cfg,
)
drawn_img = self.get_image()
if show:
self.show(drawn_img, win_name=name, wait_time=wait_time)
if out_file is not None:
# save the image to the target file instead of vis_backends
mmcv.imwrite(drawn_img[..., ::-1], out_file)
else:
self.add_image(name, drawn_img, step=step)
return drawn_img
@master_only
def visualize_vqa(self,
image: np.ndarray,
data_sample: DataSample,
resize: Optional[int] = None,
text_cfg: dict = dict(),
show: bool = False,
wait_time: float = 0,
out_file: Optional[str] = None,
name: Optional[str] = '',
step: int = 0) -> np.ndarray:
"""Visualize visual question answering result.
This method will draw the input image, the question and the predicted answer.
Args:
image (np.ndarray): The image to draw. The format should be RGB.
data_sample (:obj:`DataSample`): The annotation of the image.
resize (int, optional): Resize the short edge of the image to the
specified length before visualization. Defaults to None.
text_cfg (dict): Extra text setting, which accepts arguments of
:func:`plt.text`. Defaults to an empty dict.
show (bool): Whether to display the drawn image in a window. Please
confirm you are able to access the graphical interface.
Defaults to False.
wait_time (float): The display time (s). Defaults to 0, which means
"forever".
out_file (str, optional): Extra path to save the visualization
result. If specified, the visualizer will only save the result
image to the out_file and ignore its storage backends.
Defaults to None.
name (str): The image identifier. It's useful when using the
storage backends of the visualizer to save or display the
image. Defaults to an empty string.
step (int): The global step value. It's useful to record a
series of visualization results for the same image with the
storage backends. Defaults to 0.
Returns:
np.ndarray: The visualization image.
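Examples:
An illustrative sketch; the question and answer are placeholder
strings rather than real model inputs and outputs::

>>> import numpy as np
>>> from mmpretrain.structures import DataSample
>>> from mmpretrain.visualization import UniversalVisualizer
>>> vis = UniversalVisualizer()
>>> image = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
>>> data_sample = DataSample()
>>> data_sample.question = 'What animal is in the picture?'
>>> data_sample.pred_answer = 'cat'
>>> drawn = vis.visualize_vqa(image, data_sample)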
"""
text_cfg = {**self.DEFAULT_TEXT_CFG, **text_cfg}
if resize is not None:
h, w = image.shape[:2]
if w < h:
image = mmcv.imresize(image, (resize, resize * h // w))
else:
image = mmcv.imresize(image, (resize * w // h, resize))
self.set_image(image)
img_scale = get_adaptive_scale(image.shape[:2])
text_cfg = {
'size': int(img_scale * 7),
**self.DEFAULT_TEXT_CFG,
**text_cfg,
}
text = (f'Q: {data_sample.get("question")}\n'
f'A: {data_sample.get("pred_answer")}')
self.ax_save.text(
img_scale * 5,
img_scale * 5,
text,
wrap=True,
**text_cfg,
)
drawn_img = self.get_image()
if show:
self.show(drawn_img, win_name=name, wait_time=wait_time)
if out_file is not None:
# save the image to the target file instead of vis_backends
mmcv.imwrite(drawn_img[..., ::-1], out_file)
else:
self.add_image(name, drawn_img, step=step)
return drawn_img
@master_only
def visualize_visual_grounding(self,
image: np.ndarray,
data_sample: DataSample,
resize: Optional[int] = None,
text_cfg: dict = dict(),
show: bool = False,
wait_time: float = 0,
out_file: Optional[str] = None,
name: Optional[str] = '',
line_width: Union[int, float] = 3,
bbox_color: Union[str, tuple] = 'green',
step: int = 0) -> np.ndarray:
"""Visualize visual grounding result.
This method will draw the input image, the predicted and ground-truth bboxes, and the grounding text.
Args:
image (np.ndarray): The image to draw. The format should be RGB.
data_sample (:obj:`DataSample`): The annotation of the image.
resize (int, optional): Resize the short edge of the image to the
specified length before visualization. Defaults to None.
text_cfg (dict): Extra text setting, which accepts arguments of
:func:`plt.text`. Defaults to an empty dict.
show (bool): Whether to display the drawn image in a window. Please
confirm you are able to access the graphical interface.
Defaults to False.
wait_time (float): The display time (s). Defaults to 0, which means
"forever".
out_file (str, optional): Extra path to save the visualization
result. If specified, the visualizer will only save the result
image to the out_file and ignore its storage backends.
Defaults to None.
name (str): The image identifier. It's useful when using the
storage backends of the visualizer to save or display the
image. Defaults to an empty string.
line_width (int | float): The line width of the bounding boxes.
Defaults to 3.
bbox_color (str | tuple): The edge color of the predicted bounding
boxes. Defaults to 'green'.
step (int): The global step value. It's useful to record a
series of visualization results for the same image with the
storage backends. Defaults to 0.
Returns:
np.ndarray: The visualization image.
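Examples:
An illustrative sketch with a hand-written box; ``pred_bboxes`` is
assumed to be an ``(N, 4)`` tensor in ``(x1, y1, x2, y2)`` format::

>>> import numpy as np
>>> import torch
>>> from mmpretrain.structures import DataSample
>>> from mmpretrain.visualization import UniversalVisualizer
>>> vis = UniversalVisualizer()
>>> image = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
>>> data_sample = DataSample()
>>> data_sample.pred_bboxes = torch.tensor([[20., 30., 160., 180.]])
>>> data_sample.text = 'the cat on the left'
>>> drawn = vis.visualize_visual_grounding(image, data_sample)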
"""
text_cfg = {**self.DEFAULT_TEXT_CFG, **text_cfg}
gt_bboxes = data_sample.get('gt_bboxes')
pred_bboxes = data_sample.get('pred_bboxes')
if resize is not None:
h, w = image.shape[:2]
if w < h:
image, w_scale, h_scale = mmcv.imresize(
image, (resize, resize * h // w), return_scale=True)
else:
image, w_scale, h_scale = mmcv.imresize(
image, (resize * w // h, resize), return_scale=True)
pred_bboxes[:, ::2] *= w_scale
pred_bboxes[:, 1::2] *= h_scale
if gt_bboxes is not None:
gt_bboxes[:, ::2] *= w_scale
gt_bboxes[:, 1::2] *= h_scale
self.set_image(image)
# Avoid the line-width limit in the base classes.
self._default_font_size = 1e3
self.draw_bboxes(
pred_bboxes, line_widths=line_width, edge_colors=bbox_color)
if gt_bboxes is not None:
self.draw_bboxes(
gt_bboxes, line_widths=line_width, edge_colors='blue')
img_scale = get_adaptive_scale(image.shape[:2])
text_cfg = {
'size': int(img_scale * 7),
**self.DEFAULT_TEXT_CFG,
**text_cfg,
}
text_positions = pred_bboxes[:, :2] + line_width
for i in range(pred_bboxes.size(0)):
self.ax_save.text(
text_positions[i, 0] + line_width,
text_positions[i, 1] + line_width,
data_sample.get('text'),
**text_cfg,
)
drawn_img = self.get_image()
if show:
self.show(drawn_img, win_name=name, wait_time=wait_time)
if out_file is not None:
# save the image to the target file instead of vis_backends
mmcv.imwrite(drawn_img[..., ::-1], out_file)
else:
self.add_image(name, drawn_img, step=step)
return drawn_img
@master_only
def visualize_t2i_retrieval(self,
text: str,
data_sample: DataSample,
prototype_dataset: BaseDataset,
topk: int = 1,
draw_score: bool = True,
text_cfg: dict = dict(),
fig_cfg: dict = dict(),
show: bool = False,
wait_time: float = 0,
out_file: Optional[str] = None,
name: Optional[str] = '',
step: int = 0) -> np.ndarray:
"""Visualize Text-To-Image retrieval result.
This method will draw the input text and the images retrieved from the
prototype dataset.
Args:
text (str): The query text to draw.
data_sample (:obj:`DataSample`): The data sample containing the
retrieval prediction scores.
prototype_dataset (:obj:`BaseDataset`): The prototype dataset.
It should have a `get_data_info` method that returns a dict
including `img_path`.
topk (int): To visualize the topk matching items. Defaults to 1.
draw_score (bool): Whether to draw the match scores of the
retrieved images. Defaults to True.
text_cfg (dict): Extra text setting, which accepts arguments of
:func:`plt.text`. Defaults to an empty dict.
fig_cfg (dict): Extra figure setting, which accepts arguments of
:func:`plt.Figure`. Defaults to an empty dict.
show (bool): Whether to display the drawn image in a window. Please
confirm you are able to access the graphical interface.
Defaults to False.
wait_time (float): The display time (s). Defaults to 0, which means
"forever".
out_file (str, optional): Extra path to save the visualization
result. If specified, the visualizer will only save the result
image to the out_file and ignore its storage backends.
Defaults to None.
name (str): The image identifier. It's useful when using the
storage backends of the visualizer to save or display the
image. Defaults to an empty string.
step (int): The global step value. It's useful to record a
series of visualization results for the same image with the
storage backends. Defaults to 0.
Returns:
np.ndarray: The visualization image.
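Examples:
An illustrative sketch only. ``prototype`` is a placeholder for any
dataset whose ``get_data_info(i)`` returns a dict with a readable
``img_path``::

>>> import torch
>>> from mmpretrain.structures import DataSample
>>> from mmpretrain.visualization import UniversalVisualizer
>>> vis = UniversalVisualizer()
>>> data_sample = DataSample()
>>> data_sample.set_pred_score(torch.rand(len(prototype)))
>>> drawn = vis.visualize_t2i_retrieval(
...     'a photo of a dog', data_sample, prototype, topk=3)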
"""
text_cfg = {**self.DEFAULT_TEXT_CFG, **text_cfg}
match_scores, indices = torch.topk(data_sample.pred_score, k=topk)
figure = create_figure(margin=True, **fig_cfg)
figure.suptitle(text)
gs = figure.add_gridspec(1, topk)
for k, (score, sample_idx) in enumerate(zip(match_scores, indices)):
sample = prototype_dataset.get_data_info(sample_idx.item())
value_image = mmcv.imread(sample['img_path'])[..., ::-1]
value_plot = figure.add_subplot(gs[0, k])
value_plot.axis(False)
value_plot.imshow(value_image)
if draw_score:
value_plot.text(
5,
5,
f'{score:.2f}',
**text_cfg,
)
drawn_img = img_from_canvas(figure.canvas)
self.set_image(drawn_img)
if show:
self.show(drawn_img, win_name=name, wait_time=wait_time)
if out_file is not None:
# save the image to the target file instead of vis_backends
mmcv.imwrite(drawn_img[..., ::-1], out_file)
else:
self.add_image(name, drawn_img, step=step)
return drawn_img
@master_only
def visualize_i2t_retrieval(self,
image: np.ndarray,
data_sample: DataSample,
prototype_dataset: Sequence[str],
topk: int = 1,
draw_score: bool = True,
resize: Optional[int] = None,
text_cfg: dict = dict(),
show: bool = False,
wait_time: float = 0,
out_file: Optional[str] = None,
name: str = '',
step: int = 0) -> np.ndarray:
"""Visualize Image-To-Text retrieval result.
This method will draw the input image and the texts retrieved from the
prototype dataset.
Args:
image (np.ndarray): The image to draw. The format should be RGB.
data_sample (:obj:`DataSample`): The annotation of the image.
prototype_dataset (Sequence[str]): The prototype dataset.
It should be a list of texts.
topk (int): To visualize the topk matching items. Defaults to 1.
draw_score (bool): Whether to draw the match scores of the
retrieved texts. Defaults to True.
resize (int, optional): Resize the short edge of the image to the
specified length before visualization. Defaults to None.
text_cfg (dict): Extra text setting, which accepts
arguments of :meth:`mmengine.Visualizer.draw_texts`.
Defaults to an empty dict.
show (bool): Whether to display the drawn image in a window. Please
confirm you are able to access the graphical interface.
Defaults to False.
wait_time (float): The display time (s). Defaults to 0, which means
"forever".
out_file (str, optional): Extra path to save the visualization
result. If specified, the visualizer will only save the result
image to the out_file and ignore its storage backends.
Defaults to None.
name (str): The image identifier. It's useful when using the
storage backends of the visualizer to save or display the
image. Defaults to an empty string.
step (int): The global step value. It's useful to record a
series of visualization results for the same image with the
storage backends. Defaults to 0.
Returns:
np.ndarray: The visualization image.
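Examples:
A minimal, illustrative sketch where the prototype "dataset" is just
a list of candidate texts and the scores are placeholders::

>>> import numpy as np
>>> import torch
>>> from mmpretrain.structures import DataSample
>>> from mmpretrain.visualization import UniversalVisualizer
>>> vis = UniversalVisualizer()
>>> image = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
>>> texts = ['a cat', 'a dog', 'a bird']
>>> data_sample = DataSample()
>>> data_sample.set_pred_score(torch.tensor([0.6, 0.3, 0.1]))
>>> drawn = vis.visualize_i2t_retrieval(
...     image, data_sample, texts, topk=2)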
"""
if resize is not None:
h, w = image.shape[:2]
if w < h:
image = mmcv.imresize(image, (resize, resize * h // w))
else:
image = mmcv.imresize(image, (resize * w // h, resize))
self.set_image(image)
match_scores, indices = torch.topk(data_sample.pred_score, k=topk)
texts = []
for score, sample_idx in zip(match_scores, indices):
text = prototype_dataset[sample_idx.item()]
if draw_score:
text = f'{score:.2f} ' + text
texts.append(text)
img_scale = get_adaptive_scale(image.shape[:2])
text_cfg = {
'size': int(img_scale * 7),
**self.DEFAULT_TEXT_CFG,
**text_cfg,
}
self.ax_save.text(
img_scale * 5,
img_scale * 5,
'\n'.join(texts),
**text_cfg,
)
drawn_img = self.get_image()
if show:
self.show(drawn_img, win_name=name, wait_time=wait_time)
if out_file is not None:
# save the image to the target file instead of vis_backends
mmcv.imwrite(drawn_img[..., ::-1], out_file)
else:
self.add_image(name, drawn_img, step=step)
return drawn_img