add instructblip

techmonsterwang 2023-07-02 21:00:05 +08:00
parent f680f4ef30
commit 8a33a6aa2d
3 changed files with 1 addition and 25 deletions

View File

@@ -24,7 +24,7 @@ from mmpretrain import inference_model
 result = inference_model('instructblip-vicuna7b_3rdparty-zeroshot_caption', 'demo/cat-dog.png')
 print(result)
-# {'pred_caption': 'The image is a photograph of a beautiful garden. The garden is full of colorful flowers and green leaves.'}
+# {'pred_caption': 'a blanket next to each other in the grass\na cute puppy and kitten wallpapers'}
 ```
 <!-- [TABS-END] -->
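The hunk above updates the sample output in the README's inference tab. For context, a minimal sketch of exercising the same high-level API — `inference_model` and `list_models` are real mmpretrain helpers, though the exact model names returned depend on the metafiles shipped with the installed version:

```python
from mmpretrain import inference_model, list_models

# Discover registered InstructBLIP checkpoints via a wildcard pattern.
print(list_models('*instructblip*'))

# Zero-shot captioning with the checkpoint named in the diff above.
result = inference_model('instructblip-vicuna7b_3rdparty-zeroshot_caption',
                         'demo/cat-dog.png')
print(result['pred_caption'])
```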

View File

@@ -46,7 +46,6 @@ class InstructBlipCaption(BaseModel):
                  max_txt_len: int = 256,
                  end_sym: str = '\n',
                  num_captions: int = 1,
-                 generation_cfg: dict = dict(),
                  qformer_text_input=True,
                  data_preprocessor: Optional[dict] = None,
                  init_cfg: Optional[dict] = None) -> None:
@@ -121,19 +120,6 @@ class InstructBlipCaption(BaseModel):
         self.prompt_length = prompt_tokens.attention_mask.sum(1)
         self.qformer_text_input = qformer_text_input
-        # update generation configs
-        self.generation_cfg = dict(
-            max_new_tokens=300,
-            num_beams=1,
-            do_sample=True,
-            min_length=1,
-            top_p=0.9,
-            repetition_penalty=1.0,
-            length_penalty=1.0,
-            temperature=1.0,
-            **generation_cfg)
         if hasattr(self, 'register_load_state_dict_post_hook'):
             self.register_load_state_dict_post_hook(self._ignore_llm_keys_hook)
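The two hunks above drop the `generation_cfg` constructor argument together with the block that merged it into sampling defaults (the nearby `prompt_tokens.attention_mask.sum(1)` records each prompt's unpadded length from its attention mask). A minimal sketch of that defaults-plus-overrides pattern in isolation — `build_generation_cfg` is illustrative, not mmpretrain API. Note that the deleted `dict(max_new_tokens=300, ..., **generation_cfg)` form raises `TypeError` when the caller repeats a key, so the sketch merges with `{**defaults, **overrides}`, where later keys win:

```python
# Default sampling parameters, mirroring the deleted block above.
DEFAULTS = dict(
    max_new_tokens=300,
    num_beams=1,
    do_sample=True,
    min_length=1,
    top_p=0.9,
    repetition_penalty=1.0,
    length_penalty=1.0,
    temperature=1.0,
)


def build_generation_cfg(generation_cfg: dict) -> dict:
    """Merge user overrides into the defaults; later keys win."""
    return {**DEFAULTS, **generation_cfg}


cfg = build_generation_cfg(dict(num_beams=5, temperature=0.7))
assert cfg['num_beams'] == 5 and cfg['max_new_tokens'] == 300
```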
@@ -233,9 +219,6 @@ class InstructBlipCaption(BaseModel):
         attns_llama = torch.ones(
             inputs_llama.size()[:-1], dtype=torch.long).to(images.device)
-        # *******************************************************************?
         llama_tokens = self.llm_tokenizer(
             prompt,
             padding="longest",
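The truncated call above tokenizes a batch of prompts with the LLM tokenizer. A minimal sketch of the same `padding="longest"` behavior with a Hugging Face tokenizer — `AutoTokenizer` and the `padding`/`return_tensors` arguments are real `transformers` API, but the checkpoint name here is illustrative rather than the Vicuna tokenizer the model actually loads:

```python
from transformers import AutoTokenizer

# Illustrative checkpoint standing in for the model's own LLM tokenizer.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

prompts = ['Describe the image.',
           'What is shown in the picture? Answer in one sentence.']

# padding="longest" pads every prompt to the longest one in the batch;
# attention_mask marks real tokens (1) versus padding (0).
tokens = tokenizer(prompts, padding='longest', return_tensors='pt')

# Per-sample unpadded lengths: the same attention_mask.sum(1) trick used
# for self.prompt_length in the hunk further up.
print(tokens.input_ids.shape, tokens.attention_mask.sum(1))
```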

View File

@@ -2,11 +2,4 @@ from mmpretrain import inference_model
 result = inference_model('instructblip-vicuna7b_3rdparty-zeroshot_caption', 'demo/cat-dog.png')
 print(result)
 # {'pred_caption': 'This image shows a small dog and a kitten sitting on a blanket in a field of flowers. The dog is looking up at the kitten with a playful expression on its face. The background is a colorful striped blanket, and there are flowers all around them. The image is well composed with the two animals sitting in the center of the frame, surrounded by the flowers and blanket.'}
-# from mmpretrain import inference_model
-# result = inference_model('minigpt-4_vicuna-7b_caption', 'demo/cat-dog.png')
-# print(result)
-# {'pred_caption': 'This image shows a small dog and a kitten sitting on a blanket in a field of flowers. The dog is looking up at the kitten with a playful expression on its face. The background is a colorful striped blanket, and there are flowers all around them. The image is well composed with the two animals sitting in the center of the frame, surrounded by the flowers and blanket.'}