update training code

pull/18/head
wjf5203 2024-03-19 10:31:00 +08:00
parent c16a4e7999
commit 7a021a077b
572 changed files with 75428 additions and 810 deletions


@ -1,10 +1,4 @@
<div align=center>
<img src="assets/images/GLEE_logo.png" width=900 >
</div>
# GLEE: General Object Foundation Model for Images and Videos at Scale
> #### Junfeng Wu\*, Yi Jiang\*, Qihao Liu, Zehuan Yuan, Xiang Bai<sup>&dagger;</sup>, and Song Bai<sup>&dagger;</sup>
@ -13,37 +7,18 @@
\[[Project Page](https://glee-vision.github.io/)\] \[[Paper](https://arxiv.org/abs/2312.09158)\] \[[HuggingFace Demo](https://huggingface.co/spaces/Junfeng5/GLEE_demo)\] \[[Video Demo](https://youtu.be/PSVhfTPx0GQ)\]
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/long-tail-video-object-segmentation-on-burst-1)](https://paperswithcode.com/sota/long-tail-video-object-segmentation-on-burst-1?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/video-instance-segmentation-on-ovis-1)](https://paperswithcode.com/sota/video-instance-segmentation-on-ovis-1?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/referring-video-object-segmentation-on-refer)](https://paperswithcode.com/sota/referring-video-object-segmentation-on-refer?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/referring-expression-segmentation-on-refer-1)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refer-1?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/multi-object-tracking-on-tao)](https://paperswithcode.com/sota/multi-object-tracking-on-tao?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/open-world-instance-segmentation-on-uvo)](https://paperswithcode.com/sota/open-world-instance-segmentation-on-uvo?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/referring-expression-segmentation-on-refcoco)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/referring-expression-segmentation-on-refcocog)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcocog?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/video-instance-segmentation-on-youtube-vis-1)](https://paperswithcode.com/sota/video-instance-segmentation-on-youtube-vis-1?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/object-detection-on-lvis-v1-0-val)](https://paperswithcode.com/sota/object-detection-on-lvis-v1-0-val?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/instance-segmentation-on-lvis-v1-0-val)](https://paperswithcode.com/sota/instance-segmentation-on-lvis-v1-0-val?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/referring-expression-comprehension-on-refcoco)](https://paperswithcode.com/sota/referring-expression-comprehension-on-refcoco?p=general-object-foundation-model-for-images)[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/referring-expression-segmentation-on-refcoco-3)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco-3?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/instance-segmentation-on-coco-minival)](https://paperswithcode.com/sota/instance-segmentation-on-coco-minival?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/referring-expression-comprehension-on)](https://paperswithcode.com/sota/referring-expression-comprehension-on?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/instance-segmentation-on-coco)](https://paperswithcode.com/sota/instance-segmentation-on-coco?p=general-object-foundation-model-for-images)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/general-object-foundation-model-for-images/referring-expression-comprehension-on-refcoco-1)](https://paperswithcode.com/sota/referring-expression-comprehension-on-refcoco-1?p=general-object-foundation-model-for-images)
[![IMAGE ALT TEXT](http://img.youtube.com/vi/PSVhfTPx0GQ/0.jpg)](http://www.youtube.com/watch?v=PSVhfTPx0GQ "Video Title")
![data_demo](assets/images/data_demo.png)
![data_demo](assets/images/glee_func.gif)
## Highlight:
- GLEE is accepted by **CVPR 2024**!
- GLEE is a general object foundation model jointly trained on over **ten million images** from various benchmarks with diverse levels of supervision.
- GLEE is capable of addressing **a wide range of object-centric tasks** simultaneously while maintaining state-of-the-art performance.
- GLEE demonstrates remarkable versatility and robust **zero-shot transferability** across a spectrum of object-level image and video tasks, and able to **serve as a foundational component** for enhancing other architectures or models.
@ -51,10 +26,28 @@
We will release the following contents for **GLEE**:exclamation:
- [x] Demo Code
- [x] Model Checkpoint
- [x] Model Zoo
- [x] Comprehensive User Guide
- [x] Training Code and Scripts
- [ ] Detailed Evaluation Code and Scripts
- [ ] Tutorial for Zero-shot Testing or Fine-tuning GLEE on New Datasets
## Getting started
1. Installation: Please refer to [INSTALL.md](assets/INSTALL.md) for more details.
2. Data preparation: Please refer to [DATA.md](assets/DATA.md) for more details.
3. Training: Please refer to [TRAIN.md](assets/TRAIN.md) for more details.
4. Testing: Please refer to [TEST.md](assets/TEST.md) for more details.
5. Model zoo: Please refer to [MODEL_ZOO.md](assets/MODEL_ZOO.md) for more details.
@ -64,13 +57,6 @@ Try our online demo app on \[[HuggingFace Demo](https://huggingface.co/spaces/Ju
```bash
git clone https://github.com/FoundationVision/GLEE
cd GLEE/app/
pip install -r requirements.txt
```
Download the pretrained weights for [GLEE-Lite](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/GLEE_R50_Scaleup10m.pth?download=true) and [GLEE-Plus](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/GLEE_SwinL_Scaleup10m.pth?download=true):
```
# runs on both CPU and GPU
python app.py
```
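If you prefer to build the model in your own script rather than through the Gradio app, the demo constructs it roughly as follows. This is a minimal sketch distilled from the updated `app/app.py` in this commit; the import path of `add_deeplab_config` is assumed to follow the other moved modules, and the config/weight paths are illustrative (the scale-up weights linked above can be loaded the same way).

```python
# Minimal sketch of how the demo builds GLEE-Lite (see app/app.py in this commit).
import torch
from detectron2.config import get_cfg
from projects.GLEE.glee.config_deeplab import add_deeplab_config  # path assumed from the repo layout
from projects.GLEE.glee.config import add_glee_config
from projects.GLEE.glee.models.glee_model import GLEE_Model

device = "cuda" if torch.cuda.is_available() else "cpu"

cfg = get_cfg()
add_deeplab_config(cfg)   # register DeepLab keys used by the pixel decoder config
add_glee_config(cfg)      # register GLEE-specific keys
cfg.merge_from_file("projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml")

model = GLEE_Model(cfg, None, device, None, True).to(device)
state_dict = torch.load("GLEE_Lite_joint.pth", map_location=device)
model.load_state_dict(state_dict, strict=False)
model.eval()
```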
@ -79,6 +65,16 @@ python app.py
# Introduction
GLEE has been trained on over ten million images from 16 datasets, fully harnessing both existing annotated data and cost-effective automatically labeled data to construct a diverse training set. This extensive training regime endows GLEE with formidable generalization capabilities.
![data_demo](assets/images/data_demo.png)
GLEE consists of an image encoder, a text encoder, a visual prompter, and an object decoder, as illustrated in the figure below. The text encoder processes arbitrary descriptions related to the task, including **1) object category lists, 2) object names in any form, 3) captions about objects, and 4) referring expressions**. The visual prompter encodes user inputs such as **1) points, 2) bounding boxes, and 3) scribbles** during interactive segmentation into corresponding visual representations of target objects. These are then integrated into the object decoder, which extracts objects from images according to the textual and visual input.
![pipeline](assets/images/pipeline.png)
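To make this interface concrete, the snippet below shows schematically how text and visual prompts are passed to the model, following the `GLEE_Model.forward(images, prompts, task, ...)` signature in `glee/models/glee_model.py`. It reuses `model` from the loading sketch above; the task names, image sizes, and scribble mask are illustrative placeholders rather than a tested pipeline.

```python
import torch

images = torch.randn(1, 3, 800, 1344)  # one preprocessed image (values illustrative)

# 1) Category names: an open-vocabulary class list goes in batch_name_list.
out = model(images, prompts={}, task="coco",
            batch_name_list=["person", "dog", "frisbee"], is_train=False)

# 2) Expressions / captions: free-form text goes in prompts["grounding"].
out = model(images, prompts={"grounding": ["the man in a red shirt"]},
            task="grounding", is_train=False)

# 3) Interactive segmentation: a boolean mask per image encodes points/boxes/scribbles.
scribble = torch.zeros(1, 800, 1344, dtype=torch.bool)
scribble[0, 380:420, 640:680] = True  # a rough scribble over the target region
out = model(images, prompts={"spatial": [scribble]}, task="coco",
            batch_name_list=["object"], is_train=False, visual_prompt_type="scribble")
```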
@ -117,3 +113,14 @@ Based on the above designs, GLEE can be used to seamlessly unify a wide range of
}
```
## Acknowledgments
- Thanks [UNINEXT](https://github.com/MasterBin-IIAU/UNINEXT) for the implementation of multi-dataset training and data processing.
- Thanks [VNext](https://github.com/wjf5203/VNext) for providing experience of Video Instance Segmentation (VIS).
- Thanks [SEEM](https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once) for providing the implementation of the visual prompter.
- Thanks [MaskDINO](https://github.com/IDEA-Research/MaskDINO) for providing a powerful detector and segmenter.


@ -12,13 +12,13 @@ import cv2
import torch
from detectron2.config import get_cfg
from GLEE.glee.models.glee_model import GLEE_Model
from GLEE.glee.config_deeplab import add_deeplab_config
from GLEE.glee.config import add_glee_config
from projects.GLEE.glee.models.glee_model import GLEE_Model
from projects.GLEE.glee.config import add_glee_config
# from projects.GLEE import GLEE
import torch.nn.functional as F
import torchvision
import math
from obj365_name import categories as OBJ365_CATEGORIESV2
from projects.GLEE.glee.data.datasets.objects365_v2 import categories as OBJ365_CATEGORIESV2
print(f"Is CUDA available: {torch.cuda.is_available()}")
@ -80,10 +80,9 @@ else:
device='cpu'
cfg_r50 = get_cfg()
add_deeplab_config(cfg_r50)
add_glee_config(cfg_r50)
conf_files_r50 = 'GLEE/configs/R50.yaml'
checkpoints_r50 = torch.load('GLEE_R50_Scaleup10m.pth')
conf_files_r50 = 'projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml'
checkpoints_r50 = torch.load('GLEE_Lite_joint.pth')
cfg_r50.merge_from_file(conf_files_r50)
GLEEmodel_r50 = GLEE_Model(cfg_r50, None, device, None, True).to(device)
GLEEmodel_r50.load_state_dict(checkpoints_r50, strict=False)
@ -91,10 +90,9 @@ GLEEmodel_r50.eval()
cfg_swin = get_cfg()
add_deeplab_config(cfg_swin)
add_glee_config(cfg_swin)
conf_files_swin = 'GLEE/configs/SwinL.yaml'
checkpoints_swin = torch.load('GLEE_SwinL_Scaleup10m.pth')
conf_files_swin = 'projects/GLEE/configs/images/Plus/Stage2_joint_training_CLIPteacher_SwinL.yaml'
checkpoints_swin = torch.load('GLEE_Plus_joint.pth')
cfg_swin.merge_from_file(conf_files_swin)
GLEEmodel_swin = GLEE_Model(cfg_swin, None, device, None, True).to(device)
GLEEmodel_swin.load_state_dict(checkpoints_swin, strict=False)
@ -157,7 +155,7 @@ def segment_image(img,prompt_mode, categoryname, custom_category, expressiong, r
if categoryname =="COCO-80":
batch_category_name = coco_class_name
elif categoryname =="OBJ365":
batch_category_name = obj365_class_name
batch_category_name = OBJ365_class_names
elif categoryname =="Custom-List":
batch_category_name = custom_category.split(',')
else:


@ -1,28 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
def add_deeplab_config(cfg):
"""
Add config for DeepLab.
"""
# We retry random cropping until no single category in semantic segmentation GT occupies more
# than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
# Used for `poly` learning rate schedule.
cfg.SOLVER.POLY_LR_POWER = 0.9
cfg.SOLVER.POLY_LR_CONSTANT_ENDING = 0.0
# Loss type, choose from `cross_entropy`, `hard_pixel_mining`.
cfg.MODEL.SEM_SEG_HEAD.LOSS_TYPE = "hard_pixel_mining"
# DeepLab settings
cfg.MODEL.SEM_SEG_HEAD.PROJECT_FEATURES = ["res2"]
cfg.MODEL.SEM_SEG_HEAD.PROJECT_CHANNELS = [48]
cfg.MODEL.SEM_SEG_HEAD.ASPP_CHANNELS = 256
cfg.MODEL.SEM_SEG_HEAD.ASPP_DILATIONS = [6, 12, 18]
cfg.MODEL.SEM_SEG_HEAD.ASPP_DROPOUT = 0.1
cfg.MODEL.SEM_SEG_HEAD.USE_DEPTHWISE_SEPARABLE_CONV = False
# Backbone new configs
cfg.MODEL.RESNETS.RES4_DILATION = 1
cfg.MODEL.RESNETS.RES5_MULTI_GRID = [1, 2, 4]
# ResNet stem type from: `basic`, `deeplab`
cfg.MODEL.RESNETS.STEM_TYPE = "deeplab"


@ -1,296 +0,0 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
"""
import torch
import torch.nn.functional as F
from torch import nn
# from ..backbone import build_backbone, Backbone
# from ..body.encoder import build_encoder
# from ..body.decoder import build_decoder
from detectron2.modeling import build_backbone
from .pixel_decoder.maskdino_encoder import build_pixel_decoder
from .transformer_decoder.maskdino_decoder import build_transformer_decoder
import random
from transformers import AutoTokenizer
from collections import OrderedDict
from ..modules.point_features import point_sample
from timm.models.layers import trunc_normal_
from transformers import CLIPTokenizer,CLIPTextModel
from .vos_utils import masks_to_boxes, FeatureFuser
import numpy as np
import math
def rand_sample(x, max_len):
if x.shape[1] <= max_len:
return x
else:
rand_idx = torch.randperm(x.shape[1])[:max_len]
return x[:,rand_idx]
def agg_lang_feat(features, mask, pool_type="average"):
"""average pooling of language features"""
# feat: (bs, seq_len, C)
# mask: (bs, seq_len)
if pool_type == "average":
embedded = features * mask.unsqueeze(-1).float() # use mask to zero out invalid token features
aggregate = embedded.sum(1) / (mask.sum(-1).unsqueeze(-1).float())
elif pool_type == "max":
out = []
for i in range(len(features)):
pool_feat, _ = torch.max(features[i][mask[i]], 0) # (L, C) -> (C, )
out.append(pool_feat)
aggregate = torch.stack(out, dim=0) # (bs, C)
else:
raise ValueError("pool_type should be average or max")
return aggregate
class GLEE_Model(nn.Module):
"""
Main class for mask classification semantic segmentation architectures.
"""
def __init__(self, cfg, matcher, device, video_info, contras_mean):
super().__init__()
self.cfg = cfg
self.matcher = matcher
self.backbone = build_backbone(cfg)
output_channels = [v for k,v in self.backbone._out_feature_channels.items()]
self.sot_fuser = FeatureFuser(output_channels[-3:], 256)
self.tokenizer = CLIPTokenizer.from_pretrained('GLEE/clip_vit_base_patch32')
self.tokenizer.add_special_tokens({'cls_token': self.tokenizer.eos_token})
self.text_encoder = CLIPTextModel.from_pretrained('GLEE/clip_vit_base_patch32')
# self.text_encoder_teacher = CLIPTextModel.from_pretrained('GLEE/clip_vit_base_patch32')
self.lang_encoder = None
# for p in self.text_encoder_teacher.parameters():
# p.requires_grad = False
self.lang_projection = nn.Parameter(torch.rand(cfg.MODEL.LANGUAGE_BACKBONE.LANG_DIM, cfg.MODEL.DIM_PROJ))
self.text_encode_type = 'clip_teacher'
# self.lang_encoder = None
self.pixel_decoder = build_pixel_decoder(cfg, self.backbone.output_shape())
transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
self.predictor = build_transformer_decoder(cfg, transformer_predictor_in_channels, lang_encoder = self.lang_encoder, mask_classification=True,)
self.to(device)
self.video_info = video_info
self.contras_mean = contras_mean
self.track_loss_version = cfg.MODEL.TRACK_VERSION
self.no_mask_tasks = ['obj365', 'obj365_clip','openimage', 'openimage_clip', 'vg', 'grit', 'bdd_det', 'bdd_track_box']
# for visual prompt
hidden_dim = 256
self.max_spatial_len = [512,512,512,512]
self.mask_sptial_embed = nn.ParameterList([nn.Parameter(torch.empty(hidden_dim, hidden_dim)) for x in range(4)])
trunc_normal_(self.mask_sptial_embed[0], std=.02)
trunc_normal_(self.mask_sptial_embed[1], std=.02)
trunc_normal_(self.mask_sptial_embed[2], std=.02)
trunc_normal_(self.mask_sptial_embed[3], std=.02)
# learnable positive negative indicator
self.pn_indicator = nn.Embedding(2, hidden_dim)
@property
def device(self):
return self.pixel_mean.device
def forward(self, images, prompts, task, targets=None, batch_name_list=None, is_train = True, visual_prompt_type='scribble'):
extra = {}
# dist_loss = None
early_semantic = None
if self.text_encode_type == "clip_teacher":
if task not in ['grounding','rvos']:
assert batch_name_list
calsses_name_list = batch_name_list
tokenized = self.tokenizer.batch_encode_plus(calsses_name_list,
max_length=self.cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN, # 256
padding='max_length' if self.cfg.MODEL.LANGUAGE_BACKBONE.PAD_MAX else "longest", # max_length
return_special_tokens_mask=True,
return_tensors='pt',
truncation=True).to(images.device)
texts = (tokenized['input_ids'], tokenized['attention_mask'])
token_x = self.text_encoder(*texts)['last_hidden_state']
valid_mask = tokenized['attention_mask'].bool()
# token_x_teacher = self.text_encoder_teacher(*texts)['last_hidden_state']
# if is_train:
# dist_loss = F.mse_loss(token_x[valid_mask], token_x_teacher[valid_mask] )
# F.l2_loss(token_x[valid_mask], token_x_teacher[valid_mask] )
token_x = token_x @ self.lang_projection
lang_feat_pool = agg_lang_feat(token_x, tokenized['attention_mask'], pool_type="average") # (bs, 768)
extra['class_embeddings'] = lang_feat_pool
if True: # early_fusion
gather_all_classtoken = token_x.flatten(0,1)[tokenized['attention_mask'].flatten(0,1)>0]
gather_all_classtoken = gather_all_classtoken.unsqueeze(0).repeat(len(images),1,1) #[bs,L,C]
gather_all_classtoken_mask = torch.ones_like(gather_all_classtoken[:,:,0])>0 #[bs,L]
early_semantic = {"hidden":gather_all_classtoken.float(),"masks":gather_all_classtoken_mask}
if 'grounding' in prompts:
if self.text_encode_type == 'clip_frozen' or self.text_encode_type == 'clip_teacher':
tokens = self.tokenizer(
prompts['grounding'], padding='max_length', truncation=True, max_length=self.cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN, return_tensors='pt'
)
tokens = {key: value.to(images.device) for key, value in tokens.items()}
texts = (tokens['input_ids'], tokens['attention_mask'])
x = self.text_encoder(*texts)
token_x = x['last_hidden_state']
token_x = token_x @ self.lang_projection
extra['grounding_tokens'] = token_x.permute(1,0,2) #[len,bz,C]
non_zero_query_mask = tokens['attention_mask']
lang_feat_pool = agg_lang_feat(token_x, non_zero_query_mask, pool_type="average").unsqueeze(1) # (bs, 1, 768)
dist_loss = (lang_feat_pool*0).sum()
extra['grounding_nonzero_mask'] = ~non_zero_query_mask.bool() # [bz,len]
extra['grounding_class'] = lang_feat_pool.squeeze(1) #[bz,C]
# gather_all_classtoken = token_x.flatten(0,1)[tokenized['attention_mask'].flatten(0,1)>0]
# gather_all_classtoken = gather_all_classtoken.unsqueeze(0).repeat(len(images),1,1) #[bs,L,C]
# gather_all_classtoken_mask = torch.ones_like(gather_all_classtoken[:,:,0])>0 #[bs,L]
# early_semantic = {"hidden":gather_all_classtoken.float(),"masks":gather_all_classtoken_mask}
early_semantic = {"hidden":token_x.float(),"masks":tokens['attention_mask']>0}
if isinstance(images,torch.Tensor):
features = self.backbone(images)
else:
features = self.backbone(images.tensor)
if 'spatial' in prompts:
## setp 1,2,3
key_images = [ images ] #bz*[1,3,H,W]
key_promptmasks = [m.unsqueeze(0) for m in prompts['spatial']] #bz*[1,1,H,W]
prompt_mode = visual_prompt_type
ref_feats, ref_masks = self.get_template(key_images, key_promptmasks, prompt_mode)
early_fusion = {"hidden":ref_feats,"masks":ref_masks}
if early_semantic is None:
early_semantic = early_fusion
else:
early_semantic["hidden"] = torch.cat([early_semantic["hidden"],early_fusion["hidden"]],dim=1)
early_semantic["masks"] = torch.cat([early_semantic["masks"],early_fusion["masks"]],dim=1)
# bz = len(images)//2
mask_features, _, multi_scale_features, zero_loss = self.pixel_decoder.forward_features(features, masks=None, early_fusion = early_semantic)
if 'spatial' in prompts:
pos_masks = prompts['spatial']
# neg_masks = [~p for p in prompts['spatial']]
neg_masks = [p&False for p in prompts['spatial']]
extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
_,h,w = extra['spatial_query_pos_mask'][0].shape
divisor = torch.tensor([h,w], device=mask_features.device)[None,]
# Get mean pos spatial query
non_zero_pos_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_pos_mask']]
non_zero_pos_point = nn.utils.rnn.pad_sequence(non_zero_pos_point, padding_value=-1).permute(1,0,2)
non_zero_pos_mask = (non_zero_pos_point.sum(dim=-1) < 0)
spatial_query_pos = point_sample(mask_features, non_zero_pos_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True) #[(N, C, P)
spatial_query_pos = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_pos.transpose(1,2), ~non_zero_pos_mask)]).transpose(0,1).nan_to_num() # [1,bz,C]
# Get mean neg spatial query
non_zero_neg_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_neg_mask']]
non_zero_neg_point = nn.utils.rnn.pad_sequence(non_zero_neg_point, padding_value=-1).permute(1,0,2)
non_zero_neg_mask = (non_zero_neg_point.sum(dim=-1) < 0)
spatial_query_neg = point_sample(mask_features, non_zero_neg_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True)
spatial_query_neg = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_neg.transpose(1,2), ~non_zero_neg_mask)]).transpose(0,1).nan_to_num()
# Get layerwise spatial query
src_spatial_queries = []
src_spatial_maskings = []
for i in range(len(multi_scale_features)):
bs,dc,h,w = multi_scale_features[i].shape
# src_mask_features = multi_scale_features[i].view(h,w,bs,dc)
src_mask_features = multi_scale_features[i].permute(2,3,0,1)
src_mask_features = src_mask_features @ self.mask_sptial_embed[i]
non_zero_query_point_pos = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_pos_mask']]
non_zero_query_point_neg = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_neg_mask']]
non_zero_query_point = [torch.cat([x,y], dim=0) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
pos_neg_indicator = [torch.cat([torch.ones(x.shape[0], device=x.device), -torch.ones(y.shape[0], device=y.device)]) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
pos_neg_indicator = nn.utils.rnn.pad_sequence(pos_neg_indicator, padding_value=0)
non_zero_query_point = nn.utils.rnn.pad_sequence(non_zero_query_point, padding_value=-1).permute(1,0,2)
non_zero_query_mask = (non_zero_query_point.sum(dim=-1) < 0)
non_zero_query_point[non_zero_query_mask] = 0
spatial_tokens = point_sample(src_mask_features.permute(2,3,0,1), non_zero_query_point.flip(dims=(2,)).type(src_mask_features.dtype), align_corners=True).permute(2,0,1)
spatial_tokens[pos_neg_indicator==1] += self.pn_indicator.weight[0:1]
spatial_tokens[pos_neg_indicator==-1] += self.pn_indicator.weight[1:2]
src_spatial_queries += [spatial_tokens]
src_spatial_maskings += [non_zero_query_mask]
extra['visual_prompt_tokens'] = src_spatial_queries #[len,bz,C]
extra['visual_prompt_nonzero_mask'] = src_spatial_maskings # [bz,len]
outputs = self.predictor(multi_scale_features, mask_features, extra=extra, task=task, masks=None, targets=targets)
return outputs
def get_template(self, imgs, pad_masks, prompt_mode='scribble'):
"""img: (N, 3, H, W), mask: (N, 1, H, W), bbox: (1, 4)"""
"""get 4-channel template"""
croped_img_with_mask = []
for image_i, mask_i in zip( imgs, pad_masks):
if prompt_mode in ['scribble','point']:
image_with_mask = image_i + mask_i.to(image_i)
else:
image_with_mask = image_i
# image_with_mask = torch.cat([image_i,mask_i.to(image_i)],dim=1) #[1,3,H,W]
box_i = masks_to_boxes(mask_i[0]) #[xyxy]
box_i[:, 2:] = box_i[:, 2:] - box_i[:, :2] #xywh
x, y, w, h = box_i[0].long().tolist()
self.search_area_factor=2
crop_sz = math.ceil(math.sqrt(w * h) * self.search_area_factor)
x1 = max(0,round(x + 0.5 * w - crop_sz * 0.5))
x2 = x1 + crop_sz
y1 = max(0,round(y + 0.5 * h - crop_sz * 0.5))
y2 = y1 + crop_sz
im_crop = image_with_mask[:, :, y1:y2, x1:x2]
# resize
if im_crop.shape[-1] ==0 or im_crop.shape[-2] ==0 :
im_crop = image_with_mask
im_crop = F.interpolate(im_crop, (256,256), mode='bilinear', align_corners=False)
croped_img_with_mask.append(im_crop)
croped_img_with_mask = torch.cat(croped_img_with_mask,dim=0) #[bz,3,256,256]
with torch.no_grad():
ref_srcs = self.backbone(croped_img_with_mask.contiguous())
ref_srcs = [v for k,v in ref_srcs.items()]
ref_feats = self.sot_fuser(ref_srcs[1:]).float() #[bz,256,32,32]
ref_feats = ref_feats.flatten(-2).permute(0, 2, 1) # (bs, L, C)
ref_masks = torch.ones_like(ref_feats[:,:,0])>0 #[bs,L]
return ref_feats, ref_masks


@ -1,367 +0,0 @@
categories = [
{'id': 1, 'name': 'Person'},
{'id': 2, 'name': 'Sneakers'},
{'id': 3, 'name': 'Chair'},
{'id': 4, 'name': 'Other Shoes'},
{'id': 5, 'name': 'Hat'},
{'id': 6, 'name': 'Car'},
{'id': 7, 'name': 'Lamp'},
{'id': 8, 'name': 'Glasses'},
{'id': 9, 'name': 'Bottle'},
{'id': 10, 'name': 'Desk'},
{'id': 11, 'name': 'Cup'},
{'id': 12, 'name': 'Street Lights'},
{'id': 13, 'name': 'Cabinet/shelf'},
{'id': 14, 'name': 'Handbag/Satchel'},
{'id': 15, 'name': 'Bracelet'},
{'id': 16, 'name': 'Plate'},
{'id': 17, 'name': 'Picture/Frame'},
{'id': 18, 'name': 'Helmet'},
{'id': 19, 'name': 'Book'},
{'id': 20, 'name': 'Gloves'},
{'id': 21, 'name': 'Storage box'},
{'id': 22, 'name': 'Boat'},
{'id': 23, 'name': 'Leather Shoes'},
{'id': 24, 'name': 'Flower'},
{'id': 25, 'name': 'Bench'},
{'id': 26, 'name': 'Potted Plant'},
{'id': 27, 'name': 'Bowl/Basin'},
{'id': 28, 'name': 'Flag'},
{'id': 29, 'name': 'Pillow'},
{'id': 30, 'name': 'Boots'},
{'id': 31, 'name': 'Vase'},
{'id': 32, 'name': 'Microphone'},
{'id': 33, 'name': 'Necklace'},
{'id': 34, 'name': 'Ring'},
{'id': 35, 'name': 'SUV'},
{'id': 36, 'name': 'Wine Glass'},
{'id': 37, 'name': 'Belt'},
{'id': 38, 'name': 'Moniter/TV'},
{'id': 39, 'name': 'Backpack'},
{'id': 40, 'name': 'Umbrella'},
{'id': 41, 'name': 'Traffic Light'},
{'id': 42, 'name': 'Speaker'},
{'id': 43, 'name': 'Watch'},
{'id': 44, 'name': 'Tie'},
{'id': 45, 'name': 'Trash bin Can'},
{'id': 46, 'name': 'Slippers'},
{'id': 47, 'name': 'Bicycle'},
{'id': 48, 'name': 'Stool'},
{'id': 49, 'name': 'Barrel/bucket'},
{'id': 50, 'name': 'Van'},
{'id': 51, 'name': 'Couch'},
{'id': 52, 'name': 'Sandals'},
{'id': 53, 'name': 'Bakset'},
{'id': 54, 'name': 'Drum'},
{'id': 55, 'name': 'Pen/Pencil'},
{'id': 56, 'name': 'Bus'},
{'id': 57, 'name': 'Wild Bird'},
{'id': 58, 'name': 'High Heels'},
{'id': 59, 'name': 'Motorcycle'},
{'id': 60, 'name': 'Guitar'},
{'id': 61, 'name': 'Carpet'},
{'id': 62, 'name': 'Cell Phone'},
{'id': 63, 'name': 'Bread'},
{'id': 64, 'name': 'Camera'},
{'id': 65, 'name': 'Canned'},
{'id': 66, 'name': 'Truck'},
{'id': 67, 'name': 'Traffic cone'},
{'id': 68, 'name': 'Cymbal'},
{'id': 69, 'name': 'Lifesaver'},
{'id': 70, 'name': 'Towel'},
{'id': 71, 'name': 'Stuffed Toy'},
{'id': 72, 'name': 'Candle'},
{'id': 73, 'name': 'Sailboat'},
{'id': 74, 'name': 'Laptop'},
{'id': 75, 'name': 'Awning'},
{'id': 76, 'name': 'Bed'},
{'id': 77, 'name': 'Faucet'},
{'id': 78, 'name': 'Tent'},
{'id': 79, 'name': 'Horse'},
{'id': 80, 'name': 'Mirror'},
{'id': 81, 'name': 'Power outlet'},
{'id': 82, 'name': 'Sink'},
{'id': 83, 'name': 'Apple'},
{'id': 84, 'name': 'Air Conditioner'},
{'id': 85, 'name': 'Knife'},
{'id': 86, 'name': 'Hockey Stick'},
{'id': 87, 'name': 'Paddle'},
{'id': 88, 'name': 'Pickup Truck'},
{'id': 89, 'name': 'Fork'},
{'id': 90, 'name': 'Traffic Sign'},
{'id': 91, 'name': 'Ballon'},
{'id': 92, 'name': 'Tripod'},
{'id': 93, 'name': 'Dog'},
{'id': 94, 'name': 'Spoon'},
{'id': 95, 'name': 'Clock'},
{'id': 96, 'name': 'Pot'},
{'id': 97, 'name': 'Cow'},
{'id': 98, 'name': 'Cake'},
{'id': 99, 'name': 'Dinning Table'},
{'id': 100, 'name': 'Sheep'},
{'id': 101, 'name': 'Hanger'},
{'id': 102, 'name': 'Blackboard/Whiteboard'},
{'id': 103, 'name': 'Napkin'},
{'id': 104, 'name': 'Other Fish'},
{'id': 105, 'name': 'Orange/Tangerine'},
{'id': 106, 'name': 'Toiletry'},
{'id': 107, 'name': 'Keyboard'},
{'id': 108, 'name': 'Tomato'},
{'id': 109, 'name': 'Lantern'},
{'id': 110, 'name': 'Machinery Vehicle'},
{'id': 111, 'name': 'Fan'},
{'id': 112, 'name': 'Green Vegetables'},
{'id': 113, 'name': 'Banana'},
{'id': 114, 'name': 'Baseball Glove'},
{'id': 115, 'name': 'Airplane'},
{'id': 116, 'name': 'Mouse'},
{'id': 117, 'name': 'Train'},
{'id': 118, 'name': 'Pumpkin'},
{'id': 119, 'name': 'Soccer'},
{'id': 120, 'name': 'Skiboard'},
{'id': 121, 'name': 'Luggage'},
{'id': 122, 'name': 'Nightstand'},
{'id': 123, 'name': 'Tea pot'},
{'id': 124, 'name': 'Telephone'},
{'id': 125, 'name': 'Trolley'},
{'id': 126, 'name': 'Head Phone'},
{'id': 127, 'name': 'Sports Car'},
{'id': 128, 'name': 'Stop Sign'},
{'id': 129, 'name': 'Dessert'},
{'id': 130, 'name': 'Scooter'},
{'id': 131, 'name': 'Stroller'},
{'id': 132, 'name': 'Crane'},
{'id': 133, 'name': 'Remote'},
{'id': 134, 'name': 'Refrigerator'},
{'id': 135, 'name': 'Oven'},
{'id': 136, 'name': 'Lemon'},
{'id': 137, 'name': 'Duck'},
{'id': 138, 'name': 'Baseball Bat'},
{'id': 139, 'name': 'Surveillance Camera'},
{'id': 140, 'name': 'Cat'},
{'id': 141, 'name': 'Jug'},
{'id': 142, 'name': 'Broccoli'},
{'id': 143, 'name': 'Piano'},
{'id': 144, 'name': 'Pizza'},
{'id': 145, 'name': 'Elephant'},
{'id': 146, 'name': 'Skateboard'},
{'id': 147, 'name': 'Surfboard'},
{'id': 148, 'name': 'Gun'},
{'id': 149, 'name': 'Skating and Skiing shoes'},
{'id': 150, 'name': 'Gas stove'},
{'id': 151, 'name': 'Donut'},
{'id': 152, 'name': 'Bow Tie'},
{'id': 153, 'name': 'Carrot'},
{'id': 154, 'name': 'Toilet'},
{'id': 155, 'name': 'Kite'},
{'id': 156, 'name': 'Strawberry'},
{'id': 157, 'name': 'Other Balls'},
{'id': 158, 'name': 'Shovel'},
{'id': 159, 'name': 'Pepper'},
{'id': 160, 'name': 'Computer Box'},
{'id': 161, 'name': 'Toilet Paper'},
{'id': 162, 'name': 'Cleaning Products'},
{'id': 163, 'name': 'Chopsticks'},
{'id': 164, 'name': 'Microwave'},
{'id': 165, 'name': 'Pigeon'},
{'id': 166, 'name': 'Baseball'},
{'id': 167, 'name': 'Cutting/chopping Board'},
{'id': 168, 'name': 'Coffee Table'},
{'id': 169, 'name': 'Side Table'},
{'id': 170, 'name': 'Scissors'},
{'id': 171, 'name': 'Marker'},
{'id': 172, 'name': 'Pie'},
{'id': 173, 'name': 'Ladder'},
{'id': 174, 'name': 'Snowboard'},
{'id': 175, 'name': 'Cookies'},
{'id': 176, 'name': 'Radiator'},
{'id': 177, 'name': 'Fire Hydrant'},
{'id': 178, 'name': 'Basketball'},
{'id': 179, 'name': 'Zebra'},
{'id': 180, 'name': 'Grape'},
{'id': 181, 'name': 'Giraffe'},
{'id': 182, 'name': 'Potato'},
{'id': 183, 'name': 'Sausage'},
{'id': 184, 'name': 'Tricycle'},
{'id': 185, 'name': 'Violin'},
{'id': 186, 'name': 'Egg'},
{'id': 187, 'name': 'Fire Extinguisher'},
{'id': 188, 'name': 'Candy'},
{'id': 189, 'name': 'Fire Truck'},
{'id': 190, 'name': 'Billards'},
{'id': 191, 'name': 'Converter'},
{'id': 192, 'name': 'Bathtub'},
{'id': 193, 'name': 'Wheelchair'},
{'id': 194, 'name': 'Golf Club'},
{'id': 195, 'name': 'Briefcase'},
{'id': 196, 'name': 'Cucumber'},
{'id': 197, 'name': 'Cigar/Cigarette '},
{'id': 198, 'name': 'Paint Brush'},
{'id': 199, 'name': 'Pear'},
{'id': 200, 'name': 'Heavy Truck'},
{'id': 201, 'name': 'Hamburger'},
{'id': 202, 'name': 'Extractor'},
{'id': 203, 'name': 'Extention Cord'},
{'id': 204, 'name': 'Tong'},
{'id': 205, 'name': 'Tennis Racket'},
{'id': 206, 'name': 'Folder'},
{'id': 207, 'name': 'American Football'},
{'id': 208, 'name': 'earphone'},
{'id': 209, 'name': 'Mask'},
{'id': 210, 'name': 'Kettle'},
{'id': 211, 'name': 'Tennis'},
{'id': 212, 'name': 'Ship'},
{'id': 213, 'name': 'Swing'},
{'id': 214, 'name': 'Coffee Machine'},
{'id': 215, 'name': 'Slide'},
{'id': 216, 'name': 'Carriage'},
{'id': 217, 'name': 'Onion'},
{'id': 218, 'name': 'Green beans'},
{'id': 219, 'name': 'Projector'},
{'id': 220, 'name': 'Frisbee'},
{'id': 221, 'name': 'Washing Machine/Drying Machine'},
{'id': 222, 'name': 'Chicken'},
{'id': 223, 'name': 'Printer'},
{'id': 224, 'name': 'Watermelon'},
{'id': 225, 'name': 'Saxophone'},
{'id': 226, 'name': 'Tissue'},
{'id': 227, 'name': 'Toothbrush'},
{'id': 228, 'name': 'Ice cream'},
{'id': 229, 'name': 'Hotair ballon'},
{'id': 230, 'name': 'Cello'},
{'id': 231, 'name': 'French Fries'},
{'id': 232, 'name': 'Scale'},
{'id': 233, 'name': 'Trophy'},
{'id': 234, 'name': 'Cabbage'},
{'id': 235, 'name': 'Hot dog'},
{'id': 236, 'name': 'Blender'},
{'id': 237, 'name': 'Peach'},
{'id': 238, 'name': 'Rice'},
{'id': 239, 'name': 'Wallet/Purse'},
{'id': 240, 'name': 'Volleyball'},
{'id': 241, 'name': 'Deer'},
{'id': 242, 'name': 'Goose'},
{'id': 243, 'name': 'Tape'},
{'id': 244, 'name': 'Tablet'},
{'id': 245, 'name': 'Cosmetics'},
{'id': 246, 'name': 'Trumpet'},
{'id': 247, 'name': 'Pineapple'},
{'id': 248, 'name': 'Golf Ball'},
{'id': 249, 'name': 'Ambulance'},
{'id': 250, 'name': 'Parking meter'},
{'id': 251, 'name': 'Mango'},
{'id': 252, 'name': 'Key'},
{'id': 253, 'name': 'Hurdle'},
{'id': 254, 'name': 'Fishing Rod'},
{'id': 255, 'name': 'Medal'},
{'id': 256, 'name': 'Flute'},
{'id': 257, 'name': 'Brush'},
{'id': 258, 'name': 'Penguin'},
{'id': 259, 'name': 'Megaphone'},
{'id': 260, 'name': 'Corn'},
{'id': 261, 'name': 'Lettuce'},
{'id': 262, 'name': 'Garlic'},
{'id': 263, 'name': 'Swan'},
{'id': 264, 'name': 'Helicopter'},
{'id': 265, 'name': 'Green Onion'},
{'id': 266, 'name': 'Sandwich'},
{'id': 267, 'name': 'Nuts'},
{'id': 268, 'name': 'Speed Limit Sign'},
{'id': 269, 'name': 'Induction Cooker'},
{'id': 270, 'name': 'Broom'},
{'id': 271, 'name': 'Trombone'},
{'id': 272, 'name': 'Plum'},
{'id': 273, 'name': 'Rickshaw'},
{'id': 274, 'name': 'Goldfish'},
{'id': 275, 'name': 'Kiwi fruit'},
{'id': 276, 'name': 'Router/modem'},
{'id': 277, 'name': 'Poker Card'},
{'id': 278, 'name': 'Toaster'},
{'id': 279, 'name': 'Shrimp'},
{'id': 280, 'name': 'Sushi'},
{'id': 281, 'name': 'Cheese'},
{'id': 282, 'name': 'Notepaper'},
{'id': 283, 'name': 'Cherry'},
{'id': 284, 'name': 'Pliers'},
{'id': 285, 'name': 'CD'},
{'id': 286, 'name': 'Pasta'},
{'id': 287, 'name': 'Hammer'},
{'id': 288, 'name': 'Cue'},
{'id': 289, 'name': 'Avocado'},
{'id': 290, 'name': 'Hamimelon'},
{'id': 291, 'name': 'Flask'},
{'id': 292, 'name': 'Mushroon'},
{'id': 293, 'name': 'Screwdriver'},
{'id': 294, 'name': 'Soap'},
{'id': 295, 'name': 'Recorder'},
{'id': 296, 'name': 'Bear'},
{'id': 297, 'name': 'Eggplant'},
{'id': 298, 'name': 'Board Eraser'},
{'id': 299, 'name': 'Coconut'},
{'id': 300, 'name': 'Tape Measur/ Ruler'},
{'id': 301, 'name': 'Pig'},
{'id': 302, 'name': 'Showerhead'},
{'id': 303, 'name': 'Globe'},
{'id': 304, 'name': 'Chips'},
{'id': 305, 'name': 'Steak'},
{'id': 306, 'name': 'Crosswalk Sign'},
{'id': 307, 'name': 'Stapler'},
{'id': 308, 'name': 'Campel'},
{'id': 309, 'name': 'Formula 1 '},
{'id': 310, 'name': 'Pomegranate'},
{'id': 311, 'name': 'Dishwasher'},
{'id': 312, 'name': 'Crab'},
{'id': 313, 'name': 'Hoverboard'},
{'id': 314, 'name': 'Meat ball'},
{'id': 315, 'name': 'Rice Cooker'},
{'id': 316, 'name': 'Tuba'},
{'id': 317, 'name': 'Calculator'},
{'id': 318, 'name': 'Papaya'},
{'id': 319, 'name': 'Antelope'},
{'id': 320, 'name': 'Parrot'},
{'id': 321, 'name': 'Seal'},
{'id': 322, 'name': 'Buttefly'},
{'id': 323, 'name': 'Dumbbell'},
{'id': 324, 'name': 'Donkey'},
{'id': 325, 'name': 'Lion'},
{'id': 326, 'name': 'Urinal'},
{'id': 327, 'name': 'Dolphin'},
{'id': 328, 'name': 'Electric Drill'},
{'id': 329, 'name': 'Hair Dryer'},
{'id': 330, 'name': 'Egg tart'},
{'id': 331, 'name': 'Jellyfish'},
{'id': 332, 'name': 'Treadmill'},
{'id': 333, 'name': 'Lighter'},
{'id': 334, 'name': 'Grapefruit'},
{'id': 335, 'name': 'Game board'},
{'id': 336, 'name': 'Mop'},
{'id': 337, 'name': 'Radish'},
{'id': 338, 'name': 'Baozi'},
{'id': 339, 'name': 'Target'},
{'id': 340, 'name': 'French'},
{'id': 341, 'name': 'Spring Rolls'},
{'id': 342, 'name': 'Monkey'},
{'id': 343, 'name': 'Rabbit'},
{'id': 344, 'name': 'Pencil Case'},
{'id': 345, 'name': 'Yak'},
{'id': 346, 'name': 'Red Cabbage'},
{'id': 347, 'name': 'Binoculars'},
{'id': 348, 'name': 'Asparagus'},
{'id': 349, 'name': 'Barbell'},
{'id': 350, 'name': 'Scallop'},
{'id': 351, 'name': 'Noddles'},
{'id': 352, 'name': 'Comb'},
{'id': 353, 'name': 'Dumpling'},
{'id': 354, 'name': 'Oyster'},
{'id': 355, 'name': 'Table Teniis paddle'},
{'id': 356, 'name': 'Cosmetics Brush/Eyeliner Pencil'},
{'id': 357, 'name': 'Chainsaw'},
{'id': 358, 'name': 'Eraser'},
{'id': 359, 'name': 'Lobster'},
{'id': 360, 'name': 'Durian'},
{'id': 361, 'name': 'Okra'},
{'id': 362, 'name': 'Lipstick'},
{'id': 363, 'name': 'Cosmetics Mirror'},
{'id': 364, 'name': 'Curling'},
{'id': 365, 'name': 'Table Tennis '},
]


@ -1,8 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.1.0
torchvision
scipy
opencv-python
timm
transformers
einops

assets/DATA.md 100644

@ -0,0 +1,274 @@
# Data Preparation
**This document details how to prepare all the datasets used in the training and testing stages of GLEE.**
GLEE uses the following 16 datasets for joint training and performs zero-shot evaluation on six additional datasets. For Objects365, the RefCOCO series, YouTubeVOS, Ref-YouTubeVOS, and BDD, we follow UNINEXT for preprocessing; please refer to UNINEXT for details on these datasets. Users who only want to test or fine-tune on a subset of the datasets do not need to download all of them.
## For Training
### COCO
Please download [COCO](https://cocodataset.org/#home) from the official website. We use [train2017.zip](http://images.cocodataset.org/zips/train2017.zip), [train2014.zip](http://images.cocodataset.org/zips/train2014.zip), [val2017.zip](http://images.cocodataset.org/zips/val2017.zip), [test2017.zip](http://images.cocodataset.org/zips/test2017.zip), [annotations_trainval2017.zip](http://images.cocodataset.org/annotations/annotations_trainval2017.zip), and [image_info_test2017.zip](http://images.cocodataset.org/annotations/image_info_test2017.zip). We expect that the data is organized as below.
```
${GLEE_ROOT}
-- datasets
-- coco
-- annotations
-- train2017
-- train2014
-- val2017
-- test2017
```
### LVIS
Please download [LVISv1](https://www.lvisdataset.org/dataset) from the official website. LVIS uses the COCO 2017 train, validation, and test image sets, so only the annotations need to be downloaded: [lvis_v1_train.json.zip](https://dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip), [lvis_v1_val.json.zip](https://dl.fbaipublicfiles.com/LVIS/lvis_v1_val.json.zip), and [lvis_v1_minival_inserted_image_name.json](https://huggingface.co/GLIPModel/GLIP/resolve/main/lvis_v1_minival_inserted_image_name.json). We expect that the data is organized as below.
```
${GLEE_ROOT}
-- datasets
-- lvis
-- lvis_v1_train.json
-- lvis_v1_val.json
-- lvis_v1_minival_inserted_image_name.json
```
### VisualGenome
Please download [VisualGenome](https://homes.cs.washington.edu/~ranjay/visualgenome/api.html) images from the official website: [part 1 (9.2 GB)](https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip) and [part 2 (5.47 GB)](https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip), and download our preprocessed annotation files: [train.json](), [train_from_objects.json](). We expect that the data is organized as below.
```
${GLEE_ROOT}
-- datasets
-- visual_genome
-- images
-- *.jpg
...
-- annotations
-- train_from_objects.json
-- train.json
```
### OpenImages
Please download [OpenImages v6](https://storage.googleapis.com/openimages/web/download_v6.html) images from the official website; all detection annotations need to be converted into COCO format (a conversion sketch follows the layout below). We expect that the data is organized as below.
```
${GLEE_ROOT}
-- datasets
-- openimages
-- detection
-- openimages_v6_train_bbox.json
```
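The conversion sketch referenced above: the repository expects `openimages_v6_train_bbox.json` in COCO format, and the snippet below outlines what such a conversion does. It assumes the official OpenImages `oidv6-train-annotations-bbox.csv` schema with normalized box coordinates and a precomputed `{image_id: (width, height)}` mapping; the function name and label map are placeholders, not part of the repo.

```python
# Rough sketch: OpenImages V6 box CSV -> COCO-style detection JSON.
import csv
import json

def openimages_to_coco(bbox_csv, image_sizes, label_map, out_json):
    images, annotations = {}, []
    with open(bbox_csv) as f:
        for row in csv.DictReader(f):
            img = row["ImageID"]
            if img not in image_sizes:          # skip images we did not download
                continue
            w, h = image_sizes[img]
            if img not in images:
                # COCO tools usually expect integer image ids, so assign one per image
                images[img] = {"id": len(images) + 1, "file_name": img + ".jpg",
                               "width": w, "height": h}
            x0, x1 = float(row["XMin"]) * w, float(row["XMax"]) * w
            y0, y1 = float(row["YMin"]) * h, float(row["YMax"]) * h
            annotations.append({
                "id": len(annotations) + 1,
                "image_id": images[img]["id"],
                "category_id": label_map[row["LabelName"]],   # e.g. "/m/01g317" -> int id
                "bbox": [x0, y0, x1 - x0, y1 - y0],           # COCO xywh in pixels
                "area": (x1 - x0) * (y1 - y0),
                "iscrowd": int(row["IsGroupOf"]),
            })
    coco = {"images": list(images.values()), "annotations": annotations,
            "categories": [{"id": i, "name": n} for n, i in label_map.items()]}
    with open(out_json, "w") as f:
        json.dump(coco, f)
```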
### VIS
Download the YouTube-VIS [2019](https://codalab.lisn.upsaclay.fr/competitions/6064#participate-get_data), [2021](https://codalab.lisn.upsaclay.fr/competitions/7680#participate-get_data), and [OVIS](https://codalab.lisn.upsaclay.fr/competitions/4763#participate) datasets for the video instance segmentation task. Their video annotations must be converted into COCO format in advance for image-level joint training by running ```python3 conversion/conver_vis2coco.py``` (a conceptual sketch of this conversion follows the layout below). We expect that the data is organized as below.
```
${GLEE_ROOT}
-- datasets
-- ytvis_2019
-- train
-- val
-- annotations
-- instances_train_sub.json
-- instances_val_sub.json
-- ytvis19_cocofmt.json
-- ytvis_2021
-- train
-- val
-- annotations
-- instances_train_sub.json
-- instances_val_sub.json
-- ytvis21_cocofmt.json
-- ovis
-- train
-- val
-- annotations_train.json
-- annotations_valid.json
-- ovis_cocofmt.json
```
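The conceptual sketch referenced above: the conversion flattens per-video annotations into per-frame COCO records. It assumes the public YouTube-VIS/OVIS JSON layout (`videos` with `file_names`, `annotations` with per-frame `bboxes`, `areas`, and `segmentations`); the repo's `conversion/conver_vis2coco.py` is the reference implementation.

```python
# Flatten YouTube-VIS style video annotations into per-frame COCO records.
import json

def vis_to_coco(vis_json, out_json):
    with open(vis_json) as f:
        vis = json.load(f)
    images, annotations = [], []
    image_id = {}                                 # (video_id, frame_idx) -> image id
    for video in vis["videos"]:
        for frame_idx, file_name in enumerate(video["file_names"]):
            images.append({"id": len(images) + 1, "file_name": file_name,
                           "width": video["width"], "height": video["height"]})
            image_id[(video["id"], frame_idx)] = images[-1]["id"]
    for ann in vis["annotations"]:
        for frame_idx, bbox in enumerate(ann["bboxes"]):
            if bbox is None:                      # object not visible in this frame
                continue
            annotations.append({
                "id": len(annotations) + 1,
                "image_id": image_id[(ann["video_id"], frame_idx)],
                "category_id": ann["category_id"],
                "bbox": bbox,                     # already [x, y, w, h]
                "area": ann["areas"][frame_idx],
                "segmentation": ann["segmentations"][frame_idx],
                "iscrowd": ann.get("iscrowd", 0),
            })
    with open(out_json, "w") as f:
        json.dump({"images": images, "annotations": annotations,
                   "categories": vis["categories"]}, f)
```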
### SA1B
We downloaded data from the [SA1B](https://ai.meta.com/datasets/segment-anything-downloads/) official website and use only [sa_000000.tar ~ sa_000050.tar], which we preprocess into the required format for training. First, perform an NMS-style filtering on each sa_n directory to keep the larger object-level masks by running:
```python
python3 convert_sam2coco_rewritresa1b.py --src sa_000000
python3 convert_sam2coco_rewritresa1b.py --src sa_000001
python3 convert_sam2coco_rewritresa1b.py --src sa_000002
python3 convert_sam2coco_rewritresa1b.py --src sa_000003
...
python3 convert_sam2coco_rewritresa1b.py --src sa_000050
```
Then merge all the annotations by running:
```python
python3 merge_sa1b.py
```
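For orientation, the per-directory filtering step can be pictured as a mask-level NMS that keeps larger masks and drops smaller ones they already cover. The sketch below illustrates that idea on a single SA-1B per-image JSON (which stores COCO RLE segmentations and areas); the IoU threshold and function name are assumptions, and the repo's `convert_sam2coco_rewritresa1b.py` remains the reference.

```python
# Illustrative mask-level NMS: keep larger SA-1B masks, drop overlapping smaller ones.
import json
from pycocotools import mask as mask_utils

def keep_large_masks(sa_json_path, iou_thresh=0.8):
    with open(sa_json_path) as f:
        anns = json.load(f)["annotations"]
    anns = sorted(anns, key=lambda a: a["area"], reverse=True)   # biggest first
    kept = []
    for ann in anns:
        rle = ann["segmentation"]                                # COCO RLE from SAM
        overlaps = [mask_utils.iou([rle], [k["segmentation"]], [0])[0][0] for k in kept]
        if any(o > iou_thresh for o in overlaps):                # covered by a kept larger mask
            continue
        kept.append(ann)
    return kept
```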
We expect that the data is organized as below.
```
${GLEE_ROOT}
-- datasets
-- SA1B
-- images
-- sa_000000
-- sa_1.jpg
-- sa_1.json
-- ...
-- sa_000001
-- ...
-- sa1b_subtrain_500k.json
-- sa1b_subtrain_1m.json
-- sa1b_subtrain_2m.json
```
### UVO
Please download [UVO](https://sites.google.com/view/unidentified-video-object/dataset) from the official website, and download our preprocessed annotation files: [annotations]().
We expect that the data is organized as below.
```
${GLEE_ROOT}
-- datasets
-- UVO
-- uvo_videos_dense_frames_jpg
-- uvo_videos_sparse_frames_jpg
-- uvo_videos_frames
-- annotations
-- FrameSet
-- UVO_frame_train_onecate.json
-- UVO_frame_val_onecate.json
-- VideoDenseSet
-- UVO_video_train_dense_objectlabel.json
-- UVO_video_val_dense_objectlabel.json
```
### Objects365 and others
Following UNINEXT, we prepare **Objects365, RefCOCO series, YouTubeVOS, Ref-YouTubeVOS, and BDD** data, and we expect that they are organized as below:
```
${GLEE_ROOT}
-- datasets
-- Objects365v2
-- annotations
-- zhiyuan_objv2_train_new.json
-- zhiyuan_objv2_val_new.json
-- images
-- annotations
-- refcoco-unc
-- refcocog-umd
-- refcocoplus-unc
-- ytbvos18
-- train
-- val
-- ref-youtube-vos
-- meta_expressions
-- train
-- valid
-- train.json
-- valid.json
-- RVOS_refcocofmt.json
-- bdd
-- images
-- 10k
-- 100k
-- seg_track_20
-- track
-- labels
-- box_track_20
-- det_20
-- ins_seg
-- seg_track_20
```
RVOS_refcocofmt.json is a conversion of the ref-youtube-vos annotations into the RefCOCO format, which is used for image-level training. It can be generated by running ```python3 conversion/ref-ytbvos-conversion.py```
## For Evaluation Only
The following datasets are only used for zero-shot evaluation, and are not used in joint-training.
### OmniLabel
Please download [OmniLabel](https://www.omnilabel.org/dataset/download) from the official website, and download our converted annotations in COCO format: [omnilabel](). We expect that the data is organized as below.
```
${GLEE_ROOT}
-- datasets
-- omnilabel
-- images
-- coco
-- object365
-- openimagesv5
-- omnilabel_coco.json
-- omnilabel_obj365.json
-- omnilabel_openimages.json
-- omnilabel_cocofmt.json
```
### ODinW
We follow [GLIP](https://github.com/microsoft/GLIP) to prepare the ODinW 35 dataset; run ```python3 download.py``` to download it. We expect that the data is organized as below.
```
${GLEE_ROOT}
-- datasets
-- odinw
-- dataset
-- AerialMaritimeDrone
-- CottontailRabbits
-- NorthAmericaMushrooms
-- ...
```
## Updating...
### TAO
### BURST
### LV-VIS
### MOSE

assets/INSTALL.md 100644

@ -0,0 +1,34 @@
# Install
## Requirements
We tested the code in the following environment; other versions may also be compatible, but the PyTorch version should be >= 1.7.
- CUDA 12.1
- Python 3.9.2
- PyTorch 2.1.0
- Torchvision 0.16.0
## Install environment for GLEE
```
pip3 install shapely==1.7.1
pip3 install lvis
pip3 install scipy
pip3 install fairscale
pip3 install einops
pip3 install xformers
pip3 install tensorboard
pip3 install opencv-python-headless
pip3 install timm
pip3 install ftfy
pip3 install transformers==4.36.0
pip3 install -e .
pip3 install git+https://github.com/wjf5203/cocoapi.git#"egg=pycocotools&subdirectory=PythonAPI" --user
# compile Deformable DETR
cd projects/GLEE/glee/models/pixel_decoder/ops/
python3 setup.py build install --user
```
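A quick way to confirm the compiled op is importable is shown below; the module name `MultiScaleDeformableAttention` is what MaskDINO-style `setup.py` scripts typically install and is assumed here rather than taken from this commit.

```python
# Sanity check that the compiled deformable-attention CUDA extension is importable.
import torch
import MultiScaleDeformableAttention as MSDA  # name assumed from MaskDINO-style ops

print("CUDA available:", torch.cuda.is_available())
print("Deformable attention op loaded from:", MSDA.__file__)
```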


@ -0,0 +1,41 @@
# GLEE MODEL ZOO
## Introduction
GLEE maintains state-of-the-art (SOTA) performance across multiple tasks while preserving versatility and openness, demonstrating strong generalization capabilities. Here, we provide the model weights for all three stages of GLEE: '-pretrain', '-joint', and '-scaleup'. The '-pretrain' weights refer to those pretrained on Objects365 and OpenImages, yielding an effective initialization from over three million detection images. The '-joint' weights are derived from joint training on 15 datasets, where the model achieves optimal performance. The '-scaleup' weights are obtained by incorporating additional automatically annotated SA1B and GRIT data, which enhance zero-shot performance and support a richer semantic understanding. Additionally, we offer weights fine-tuned on VOS data for interactive video tracking applications.
### Stage 1: Pretraining
| Name | Config | Weight |
| :----------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
| GLEE-Lite-pretrain | Stage1_pretrain_openimage_obj365_CLIPfrozen_R50.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Lite_pretrain.pth) |
| GLEE-Plus-pretrain | Stage1_pretrain_openimage_obj365_CLIPfrozen_SwinL.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Plus_pretrain.pth) |
| GLEE-Pro-pretrain | Stage1_pretrain_openimage_obj365_CLIPfrozen_EVA02L_LSJ1536.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Pro_pretrain.pth) |
### Stage 2: Image-level Joint Training
| Name | Config | Weight |
| :-------------: | :-------------------------------------------: | :----------------------------------------------------------: |
| GLEE-Lite-joint | Stage2_joint_training_CLIPteacher_R50.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Lite_joint.pth) |
| GLEE-Plus-joint | Stage2_joint_training_CLIPteacher_SwinL.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Plus_joint.pth) |
| GLEE-Pro-joint | Stage2_joint_training_CLIPteacher_EVA02L.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Pro_joint.pth) |
### Stage 3: Scale-up Training
| Name | Config | Weight |
| :---------------: | :------------------------------------: | :----------------------------------------------------------: |
| GLEE-Lite-scaleup | Stage3_scaleup_CLIPteacher_R50.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Lite_scaleup.pth) |
| GLEE-Plus-scaleup | Stage3_scaleup_CLIPteacher_SwinL.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Plus_scaleup.pth) |
| GLEE-Pro-scaleup | Stage3_scaleup_CLIPteacher_EVA02L.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Pro_scaleup.pth) |
### Single Tasks
We also provide models trained on a VOS task with ResNet-50 backbone:
| Name | Config | Weight |
| :-----------: | :-------------------------: | :----------------------------------------------------------: |
| GLEE-Lite-vos | VOS_joint_finetune_R50.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Lite_vos.pth) |

assets/TEST.md 100644

@ -0,0 +1,59 @@
# Tutorial for Testing (Continuously Updated)
GLEE can be directly tested on classic detection and segmentation datasets locally. For some video datasets, the results need to be submitted to Codalab for evaluation. Additionally, certain datasets such as TAO, BURST, and OmniLabel require evaluation using additional tools. We will continue to update the evaluation tutorials for all datasets reported in the paper here.
## Detection, Instance Segmentation, REC & RES
GLEE can directly perform evaluations on COCO, Objects365, LVIS, and the RefCOCO series based on Detectron2. Typically, the Stage 2 YAML config file can be used; manually set the dataset to be evaluated and download the corresponding weights from [MODEL_ZOO.md](MODEL_ZOO.md).
To inference on COCO:
```bash
# Lite
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml --num-gpus 8 --eval-only MODEL.WEIGHTS path/to/GLEE_Lite_joint.pth DATASETS.TEST '("coco_2017_val",)'
# Plus
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Plus/Stage2_joint_training_CLIPteacher_SwinL.yaml --num-gpus 8 --eval-only MODEL.WEIGHTS path/to/GLEE_Plus_joint.pth DATASETS.TEST '("coco_2017_val",)'
# Pro
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Pro/Stage2_joint_training_CLIPteacher_EVA02L.yaml --num-gpus 8 --eval-only MODEL.WEIGHTS path/to/GLEE_Pro_joint.pth DATASETS.TEST '("coco_2017_val",)'
```
Replace `"path/to/downloaded/weights"` with the actual path to the pretrained model weights and use `"DATASETS.TEST"` to specific the dataset you wish to evaluate on.
`'("coco_2017_val",)'` can be replace by :
```bash
# Lite
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml --num-gpus 8 --eval-only MODEL.WEIGHTS path/to/GLEE_Lite_joint.pth DATASETS.TEST
'("coco_2017_val",)'
'("lvis_v1_minival",)'
'("lvis_v1_val",)'
'("objects365_v2_val",)'
'("refcoco-unc-val",)'
'("refcoco-unc-testA",)'
'("refcoco-unc-testB",)'
'("refcocoplus-unc-val",)'
'("refcocoplus-unc-testA",)'
'("refcocoplus-unc-testB",)'
'("refcocog-umd-val",)'
'("refcocog-umd-test",)'
# Alternatively, to infer across all tasks at once:
'("coco_2017_val","lvis_v1_minival","lvis_v1_val","objects365_v2_val","refcoco-unc-val","refcoco-unc-testA","refcoco-unc-testB","refcocoplus-unc-val","refcocoplus-unc-testA","refcocoplus-unc-testB","refcocog-umd-val","refcocog-umd-test",)'
```
## Video Tasks (Continuously Updated)
## OmniLabel and ODinW (Continuously Updated)

assets/TRAIN.md 100644

@ -0,0 +1,93 @@
# Tutorial for Training
GLEE has three training stages: (1) Objects365 & OpenImages pretraining, (2) image-level joint training across 15 datasets, and (3) scale-up training that integrates additional SA1B and GRIT data. The corresponding YAML files start with `Stage1`, `Stage2`, and `Stage3` respectively.
By default, we train GLEE using 64 A100 GPUs with a batch size of 128. For fine-tuning on video tasks or novel downstream image tasks (ODinW), we default to using eight A100 GPUs. Users interested in specific datasets, or aiming to further improve performance by training on individual datasets, can adjust the `DATASETS` config within the YAML configuration file.
We provide configurations for Stage 1, 2, and 3 training with three types of backbones—ResNet50, Swin-Large, and EVA02-Large—across the Lite, Plus, and Pro variants, under the [projects/GLEE/configs](../projects/GLEE/configs) folder. For employing larger or novel backbones, it is advisable to initialize the components beyond the backbone with the pretrained weights from GLEE-Lite-joint to expedite convergence.
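One way to follow that advice is to load the GLEE-Lite-joint checkpoint while filtering out backbone parameters, so only the shared components (text encoder, pixel decoder, object decoder) are initialized. A minimal sketch, assuming backbone parameters carry the `backbone.` prefix as in `GLEE_Model` and that the checkpoint may be wrapped in a detectron2-style `{"model": ...}` dict:

```python
# Initialize everything except the (new, larger) backbone from GLEE-Lite-joint weights.
import torch

def load_non_backbone_weights(model, ckpt_path="GLEE_Lite_joint.pth"):
    state_dict = torch.load(ckpt_path, map_location="cpu")
    if "model" in state_dict:                      # unwrap detectron2-style checkpoints
        state_dict = state_dict["model"]
    filtered = {k: v for k, v in state_dict.items() if not k.startswith("backbone.")}
    missing, unexpected = model.load_state_dict(filtered, strict=False)
    print(f"loaded {len(filtered)} tensors; {len(missing)} missing "
          f"(mostly the new backbone), {len(unexpected)} unexpected")
```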
## Pretrained Backbone Weights
```bash
# Language Model (CLIP text encoder)
wget -P projects/GLEE/clip_vit_base_patch32/ https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/GLEE/clip_vit_base_patch32/pytorch_model.bin
# R50 (GLEE_Lite) warmup initialized weight
# A randomly initialized Transformer decoder converges slowly when combined with the large vocabulary of Objects365 and OpenImages.
# It is recommended to initialize from the MaskDINO Transformer weights (which already provide region-proposal capability) to accelerate convergence.
cd weights/
wget https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/converted_maskdino_r50_withoutclip.pth
# Swin Large backbone weight
wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth
# EVA02-Large backbone weight
wget https://huggingface.co/Yuxin-CV/EVA-02/resolve/main/eva02/pt/eva02_L_pt_m38m_p14to16.pt
# Convert EVA02 weights
python3 convert_eva02.py
```
Other pretrained GLEE models can be found in [MODEL_ZOO.md](MODEL_ZOO.md).
## Joint Training
To train from scratch, run stages 1, 2, and 3 in order; each stage builds on the weights produced by the previous one (see the sketch after the config listing below).
For training on a single machine, you can execute the following command:
```bash
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/<config_stageX.yaml> --num-gpus 8
```
Replace `<config_stageX.yaml>` with the actual configuration file for each stage:
```
${GLEE_ROOT}
-- projects
-- GLEE
-- configs
-- images
-- Lite
-- Stage1_pretrain_openimage_obj365_CLIPfrozen_R50.yaml
-- Stage2_joint_training_CLIPteacher_R50.yaml
-- Stage3_scaleup_CLIPteacher_R50.yaml
-- Plus
-- Stage1_pretrain_openimage_obj365_CLIPfrozen_SwinL.yaml
-- Stage2_joint_training_CLIPteacher_SwinL.yaml
-- Stage3_scaleup_CLIPteacher_SwinL.yaml
-- Pro
-- Stage1_pretrain_openimage_obj365_CLIPfrozen_EVA02L_LSJ1536.yaml
-- Stage2_joint_training_CLIPteacher_EVA02L.yaml
-- Stage3_scaleup_CLIPteacher_EVA02L.yaml
```
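Putting the stages together, a from-scratch run of the Lite variant on a single 8-GPU machine could look like the sketch below. The `OUTPUT_DIR` values are illustrative, and chaining stages via the `MODEL.WEIGHTS` override (pointing to the previous stage's `model_final.pth`) is an assumption about the hand-off, not a documented requirement:
```bash
# Minimal sketch: run the three Lite stages in order, initializing each stage from the
# final checkpoint of the previous one (output paths are placeholders).
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage1_pretrain_openimage_obj365_CLIPfrozen_R50.yaml \
    --num-gpus 8 OUTPUT_DIR output/GLEE_Lite_stage1
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml \
    --num-gpus 8 MODEL.WEIGHTS output/GLEE_Lite_stage1/model_final.pth OUTPUT_DIR output/GLEE_Lite_stage2
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage3_scaleup_CLIPteacher_R50.yaml \
    --num-gpus 8 MODEL.WEIGHTS output/GLEE_Lite_stage2/model_final.pth OUTPUT_DIR output/GLEE_Lite_stage3
```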
Our standard setup trains on multiple machines (64 A100 GPUs in total), for which you can use the distributed launch script:
```bash
python3 launch.py --nn <num_machines> --port <PORT> --worker_rank <Global_Rank> --master_address <MASTER_ADDRESS> --config-file projects/GLEE/configs/images/<config_stageX.yaml>
```
Here, `<num_machines>` is the number of machines you intend to use, `<MASTER_ADDRESS>` is the IP address of node 0, `<PORT>` must be the same across all nodes, `<Global_Rank>` is the global rank of the current node, and `<config_stageX.yaml>` is the configuration file for the specific training stage; a concrete example follows.
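As a concrete (hypothetical) illustration, a two-node Stage 2 run of the Lite variant, with node 0 at `10.0.0.1`, could be launched as follows; the IP address and port are placeholders:
```bash
# Run on node 0 (worker_rank 0); on node 1, use --worker_rank 1 and keep everything else identical.
python3 launch.py --nn 2 --port 29500 --worker_rank 0 --master_address 10.0.0.1 \
    --config-file projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml
```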
# Finetune (Continuously Updated)
We also provide fine-tuning scripts that enable fine-tuning GLEE on downstream tasks such as ODinW and various video tasks to achieve better performance.
These will be made available as soon as possible.

View File

@ -0,0 +1,18 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
RPN:
PRE_NMS_TOPK_TEST: 6000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "Res5ROIHeads"
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2

View File

@ -0,0 +1,31 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
RESNETS:
OUT_FEATURES: ["res5"]
RES5_DILATION: 2
RPN:
IN_FEATURES: ["res5"]
PRE_NMS_TOPK_TEST: 6000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "StandardROIHeads"
IN_FEATURES: ["res5"]
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
ROI_MASK_HEAD:
NAME: "MaskRCNNConvUpsampleHead"
NUM_CONV: 4
POOLER_RESOLUTION: 14
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2

View File

@ -0,0 +1,42 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
BACKBONE:
NAME: "build_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
RPN:
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
# Detectron1 uses 2000 proposals per-batch,
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
POST_NMS_TOPK_TRAIN: 1000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "StandardROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
ROI_MASK_HEAD:
NAME: "MaskRCNNConvUpsampleHead"
NUM_CONV: 4
POOLER_RESOLUTION: 14
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2

View File

@ -0,0 +1,25 @@
MODEL:
META_ARCHITECTURE: "RetinaNet"
BACKBONE:
NAME: "build_retinanet_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"]
FPN:
IN_FEATURES: ["res3", "res4", "res5"]
RETINANET:
IOU_THRESHOLDS: [0.4, 0.5]
IOU_LABELS: [0, -1, 1]
SMOOTH_L1_LOSS_BETA: 0.0
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2

View File

@ -0,0 +1,17 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
LOAD_PROPOSALS: True
RESNETS:
DEPTH: 50
PROPOSAL_GENERATOR:
NAME: "PrecomputedProposals"
DATASETS:
TRAIN: ("coco_2017_train",)
PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", )
TEST: ("coco_2017_val",)
PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
DATALOADER:
# proposals are part of the dataset_dicts, and take a lot of RAM
NUM_WORKERS: 2

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
MASK_ON: False
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
MASK_ON: False
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
MASK_ON: False
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,13 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
MASK_ON: False
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
PIXEL_STD: [57.375, 57.120, 58.395]
RESNETS:
STRIDE_IN_1X1: False # this is a C2 model
NUM_GROUPS: 32
WIDTH_PER_GROUP: 8
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,11 @@
from ..common.optim import SGD as optimizer
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
from ..common.data.coco import dataloader
from ..common.models.fcos import model
from ..common.train import train
dataloader.train.mapper.use_instance_mask = False
optimizer.lr = 0.01
model.backbone.bottom_up.freeze_at = 2
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"

View File

@ -0,0 +1,8 @@
_BASE_: "../Base-RetinaNet.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,11 @@
from ..common.optim import SGD as optimizer
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
from ..common.data.coco import dataloader
from ..common.models.retinanet import model
from ..common.train import train
dataloader.train.mapper.use_instance_mask = False
model.backbone.bottom_up.freeze_at = 2
optimizer.lr = 0.01
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"

View File

@ -0,0 +1,5 @@
_BASE_: "../Base-RetinaNet.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50

View File

@ -0,0 +1,8 @@
_BASE_: "../Base-RetinaNet.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,10 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
META_ARCHITECTURE: "ProposalNetwork"
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50
RPN:
PRE_NMS_TOPK_TEST: 12000
POST_NMS_TOPK_TEST: 2000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
META_ARCHITECTURE: "ProposalNetwork"
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50
RPN:
POST_NMS_TOPK_TEST: 2000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
MASK_ON: True
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
MASK_ON: True
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
MASK_ON: True
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,8 @@
from ..common.train import train
from ..common.optim import SGD as optimizer
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
from ..common.data.coco import dataloader
from ..common.models.mask_rcnn_c4 import model
model.backbone.freeze_at = 2
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,8 @@
from ..common.optim import SGD as optimizer
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
from ..common.data.coco import dataloader
from ..common.models.mask_rcnn_fpn import model
from ..common.train import train
model.backbone.bottom_up.freeze_at = 2
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50

View File

@ -0,0 +1,12 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
RPN:
BBOX_REG_LOSS_TYPE: "giou"
BBOX_REG_LOSS_WEIGHT: 2.0
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: "giou"
BBOX_REG_LOSS_WEIGHT: 10.0

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,13 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
MASK_ON: True
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
PIXEL_STD: [57.375, 57.120, 58.395]
RESNETS:
STRIDE_IN_1X1: False # this is a C2 model
NUM_GROUPS: 32
WIDTH_PER_GROUP: 8
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,34 @@
from ..common.optim import SGD as optimizer
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
from ..common.data.coco import dataloader
from ..common.models.mask_rcnn_fpn import model
from ..common.train import train
from detectron2.config import LazyCall as L
from detectron2.modeling.backbone import RegNet
from detectron2.modeling.backbone.regnet import SimpleStem, ResBottleneckBlock
# Replace default ResNet with RegNetX-4GF from the DDS paper. Config source:
# https://github.com/facebookresearch/pycls/blob/2c152a6e5d913e898cca4f0a758f41e6b976714d/configs/dds_baselines/regnetx/RegNetX-4.0GF_dds_8gpu.yaml#L4-L9 # noqa
model.backbone.bottom_up = L(RegNet)(
stem_class=SimpleStem,
stem_width=32,
block_class=ResBottleneckBlock,
depth=23,
w_a=38.65,
w_0=96,
w_m=2.43,
group_width=40,
freeze_at=2,
norm="FrozenBN",
out_features=["s1", "s2", "s3", "s4"],
)
model.pixel_std = [57.375, 57.120, 58.395]
optimizer.weight_decay = 5e-5
train.init_checkpoint = (
"https://dl.fbaipublicfiles.com/pycls/dds_baselines/160906383/RegNetX-4.0GF_dds_8gpu.pyth"
)
# RegNets benefit from enabling cudnn benchmark mode
train.cudnn_benchmark = True

View File

@ -0,0 +1,35 @@
from ..common.optim import SGD as optimizer
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
from ..common.data.coco import dataloader
from ..common.models.mask_rcnn_fpn import model
from ..common.train import train
from detectron2.config import LazyCall as L
from detectron2.modeling.backbone import RegNet
from detectron2.modeling.backbone.regnet import SimpleStem, ResBottleneckBlock
# Replace default ResNet with RegNetY-4GF from the DDS paper. Config source:
# https://github.com/facebookresearch/pycls/blob/2c152a6e5d913e898cca4f0a758f41e6b976714d/configs/dds_baselines/regnety/RegNetY-4.0GF_dds_8gpu.yaml#L4-L10 # noqa
model.backbone.bottom_up = L(RegNet)(
stem_class=SimpleStem,
stem_width=32,
block_class=ResBottleneckBlock,
depth=22,
w_a=31.41,
w_0=96,
w_m=2.24,
group_width=64,
se_ratio=0.25,
freeze_at=2,
norm="FrozenBN",
out_features=["s1", "s2", "s3", "s4"],
)
model.pixel_std = [57.375, 57.120, 58.395]
optimizer.weight_decay = 5e-5
train.init_checkpoint = (
"https://dl.fbaipublicfiles.com/pycls/dds_baselines/160906838/RegNetY-4.0GF_dds_8gpu.pyth"
)
# RegNets benefit from enabling cudnn benchmark mode
train.cudnn_benchmark = True

View File

@ -0,0 +1,15 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
KEYPOINT_ON: True
ROI_HEADS:
NUM_CLASSES: 1
ROI_BOX_HEAD:
SMOOTH_L1_BETA: 0.5 # Keypoint AP degrades (though box AP improves) when using plain L1 loss
RPN:
# Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2.
# 1000 proposals per-image is found to hurt box AP.
# Therefore we increase it to 1500 per-image.
POST_NMS_TOPK_TRAIN: 1500
DATASETS:
TRAIN: ("keypoints_coco_2017_train",)
TEST: ("keypoints_coco_2017_val",)

View File

@ -0,0 +1,8 @@
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,8 @@
from ..common.optim import SGD as optimizer
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
from ..common.data.coco_keypoint import dataloader
from ..common.models.keypoint_rcnn_fpn import model
from ..common.train import train
model.backbone.bottom_up.freeze_at = 2
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"

View File

@ -0,0 +1,5 @@
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50

View File

@ -0,0 +1,8 @@
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,12 @@
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
PIXEL_STD: [57.375, 57.120, 58.395]
RESNETS:
STRIDE_IN_1X1: False # this is a C2 model
NUM_GROUPS: 32
WIDTH_PER_GROUP: 8
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,11 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
META_ARCHITECTURE: "PanopticFPN"
MASK_ON: True
SEM_SEG_HEAD:
LOSS_WEIGHT: 0.5
DATASETS:
TRAIN: ("coco_2017_train_panoptic_separated",)
TEST: ("coco_2017_val_panoptic_separated",)
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False

View File

@ -0,0 +1,8 @@
_BASE_: "Base-Panoptic-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,8 @@
from ..common.optim import SGD as optimizer
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
from ..common.data.coco_panoptic_separated import dataloader
from ..common.models.panoptic_fpn import model
from ..common.train import train
model.backbone.bottom_up.freeze_at = 2
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"

View File

@ -0,0 +1,5 @@
_BASE_: "Base-Panoptic-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50

View File

@ -0,0 +1,8 @@
_BASE_: "Base-Panoptic-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,27 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
# WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
# For better, more stable performance initialize from COCO
WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"
MASK_ON: True
ROI_HEADS:
NUM_CLASSES: 8
# This is similar to the setting used in Mask R-CNN paper, Appendix A
# But there are some differences, e.g., we did not initialize the output
# layer using the corresponding classes from COCO
INPUT:
MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024)
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 2048
MAX_SIZE_TEST: 2048
DATASETS:
TRAIN: ("cityscapes_fine_instance_seg_train",)
TEST: ("cityscapes_fine_instance_seg_val",)
SOLVER:
BASE_LR: 0.01
STEPS: (18000,)
MAX_ITER: 24000
IMS_PER_BATCH: 8
TEST:
EVAL_PERIOD: 8000

View File

@ -0,0 +1,84 @@
Detectron2 model zoo's experimental settings and a few implementation details are different from Detectron.
The differences in implementation details are shared in
[Compatibility with Other Libraries](../../docs/notes/compatibility.md).
The differences in model zoo's experimental settings include:
* Use scale augmentation during training. This improves AP with lower training cost.
* Use L1 loss instead of smooth L1 loss for simplicity. This sometimes improves box AP but may
affect other AP.
* Use `POOLER_SAMPLING_RATIO=0` instead of 2. This does not significantly affect AP.
* Use `ROIAlignV2`. This does not significantly affect AP.
In this directory, we provide a few configs that __do not__ have the above changes.
They mimic Detectron's behavior as closely as possible,
and provide a fair comparison of accuracy and speed against Detectron.
<!--
./gen_html_table.py --config 'Detectron1-Comparisons/*.yaml' --name "Faster R-CNN" "Keypoint R-CNN" "Mask R-CNN" --fields lr_sched train_speed inference_speed mem box_AP mask_AP keypoint_AP --base-dir ../../../configs/Detectron1-Comparisons
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">kp.<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: faster_rcnn_R_50_FPN_noaug_1x -->
<tr><td align="left"><a href="faster_rcnn_R_50_FPN_noaug_1x.yaml">Faster R-CNN</a></td>
<td align="center">1x</td>
<td align="center">0.219</td>
<td align="center">0.038</td>
<td align="center">3.1</td>
<td align="center">36.9</td>
<td align="center"></td>
<td align="center"></td>
<td align="center">137781054</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x/137781054/model_final_7ab50c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x/137781054/metrics.json">metrics</a></td>
</tr>
<!-- ROW: keypoint_rcnn_R_50_FPN_1x -->
<tr><td align="left"><a href="keypoint_rcnn_R_50_FPN_1x.yaml">Keypoint R-CNN</a></td>
<td align="center">1x</td>
<td align="center">0.313</td>
<td align="center">0.071</td>
<td align="center">5.0</td>
<td align="center">53.1</td>
<td align="center"></td>
<td align="center">64.2</td>
<td align="center">137781195</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x/137781195/model_final_cce136.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x/137781195/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_FPN_noaug_1x -->
<tr><td align="left"><a href="mask_rcnn_R_50_FPN_noaug_1x.yaml">Mask R-CNN</a></td>
<td align="center">1x</td>
<td align="center">0.273</td>
<td align="center">0.043</td>
<td align="center">3.4</td>
<td align="center">37.8</td>
<td align="center">34.9</td>
<td align="center"></td>
<td align="center">137781281</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x/137781281/model_final_62ca52.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x/137781281/metrics.json">metrics</a></td>
</tr>
</tbody></table>
## Comparisons:
* Faster R-CNN: Detectron's AP is 36.7, similar to ours.
* Keypoint R-CNN: Detectron's AP is box 53.6, keypoint 64.2. Fixing a Detectron
[bug](https://github.com/facebookresearch/Detectron/issues/459) leads to a drop in box AP, which can be
compensated for by some parameter tuning.
* Mask R-CNN: Detectron's AP is box 37.7, mask 33.9. We're 1 AP better in mask AP, due to a more correct implementation.
See [this article](https://ppwwyyxx.com/blog/2021/Where-are-Pixels/) for details.
For speed comparison, see [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html).

View File

@ -0,0 +1,17 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50
# Detectron1 uses smooth L1 loss with some magic beta values.
# The defaults are changed to L1 loss in Detectron2.
RPN:
SMOOTH_L1_BETA: 0.1111
ROI_BOX_HEAD:
SMOOTH_L1_BETA: 1.0
POOLER_SAMPLING_RATIO: 2
POOLER_TYPE: "ROIAlign"
INPUT:
# no scale augmentation
MIN_SIZE_TRAIN: (800, )

View File

@ -0,0 +1,27 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
KEYPOINT_ON: True
RESNETS:
DEPTH: 50
ROI_HEADS:
NUM_CLASSES: 1
ROI_KEYPOINT_HEAD:
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 2
POOLER_TYPE: "ROIAlign"
# Detectron1 uses smooth L1 loss with some magic beta values.
# The defaults are changed to L1 loss in Detectron2.
ROI_BOX_HEAD:
SMOOTH_L1_BETA: 1.0
POOLER_SAMPLING_RATIO: 2
POOLER_TYPE: "ROIAlign"
RPN:
SMOOTH_L1_BETA: 0.1111
# Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2
# 1000 proposals per-image is found to hurt box AP.
# Therefore we increase it to 1500 per-image.
POST_NMS_TOPK_TRAIN: 1500
DATASETS:
TRAIN: ("keypoints_coco_2017_train",)
TEST: ("keypoints_coco_2017_val",)

View File

@ -0,0 +1,20 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
# Detectron1 uses smooth L1 loss with some magic beta values.
# The defaults are changed to L1 loss in Detectron2.
RPN:
SMOOTH_L1_BETA: 0.1111
ROI_BOX_HEAD:
SMOOTH_L1_BETA: 1.0
POOLER_SAMPLING_RATIO: 2
POOLER_TYPE: "ROIAlign"
ROI_MASK_HEAD:
POOLER_SAMPLING_RATIO: 2
POOLER_TYPE: "ROIAlign"
INPUT:
# no scale augmentation
MIN_SIZE_TRAIN: (800, )

View File

@ -0,0 +1,19 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
MASK_ON: True
RESNETS:
DEPTH: 101
ROI_HEADS:
NUM_CLASSES: 1230
SCORE_THRESH_TEST: 0.0001
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATASETS:
TRAIN: ("lvis_v0.5_train",)
TEST: ("lvis_v0.5_val",)
TEST:
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
DATALOADER:
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
REPEAT_THRESHOLD: 0.001

View File

@ -0,0 +1,19 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
ROI_HEADS:
NUM_CLASSES: 1230
SCORE_THRESH_TEST: 0.0001
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATASETS:
TRAIN: ("lvis_v0.5_train",)
TEST: ("lvis_v0.5_val",)
TEST:
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
DATALOADER:
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
REPEAT_THRESHOLD: 0.001

View File

@ -0,0 +1,23 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
PIXEL_STD: [57.375, 57.120, 58.395]
MASK_ON: True
RESNETS:
STRIDE_IN_1X1: False # this is a C2 model
NUM_GROUPS: 32
WIDTH_PER_GROUP: 8
DEPTH: 101
ROI_HEADS:
NUM_CLASSES: 1230
SCORE_THRESH_TEST: 0.0001
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATASETS:
TRAIN: ("lvis_v0.5_train",)
TEST: ("lvis_v0.5_val",)
TEST:
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
DATALOADER:
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
REPEAT_THRESHOLD: 0.001

View File

@ -0,0 +1,22 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
MASK_ON: True
RESNETS:
DEPTH: 101
ROI_HEADS:
NUM_CLASSES: 1203
SCORE_THRESH_TEST: 0.0001
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATASETS:
TRAIN: ("lvis_v1_train",)
TEST: ("lvis_v1_val",)
TEST:
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
SOLVER:
STEPS: (120000, 160000)
MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
DATALOADER:
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
REPEAT_THRESHOLD: 0.001

View File

@ -0,0 +1,22 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
ROI_HEADS:
NUM_CLASSES: 1203
SCORE_THRESH_TEST: 0.0001
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATASETS:
TRAIN: ("lvis_v1_train",)
TEST: ("lvis_v1_val",)
TEST:
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
SOLVER:
STEPS: (120000, 160000)
MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
DATALOADER:
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
REPEAT_THRESHOLD: 0.001

View File

@ -0,0 +1,26 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
PIXEL_STD: [57.375, 57.120, 58.395]
MASK_ON: True
RESNETS:
STRIDE_IN_1X1: False # this is a C2 model
NUM_GROUPS: 32
WIDTH_PER_GROUP: 8
DEPTH: 101
ROI_HEADS:
NUM_CLASSES: 1203
SCORE_THRESH_TEST: 0.0001
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATASETS:
TRAIN: ("lvis_v1_train",)
TEST: ("lvis_v1_val",)
SOLVER:
STEPS: (120000, 160000)
MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
TEST:
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
DATALOADER:
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
REPEAT_THRESHOLD: 0.001

View File

@ -0,0 +1,12 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
ROI_HEADS:
NAME: CascadeROIHeads
ROI_BOX_HEAD:
CLS_AGNOSTIC_BBOX_REG: True
RPN:
POST_NMS_TOPK_TRAIN: 2000

View File

@ -0,0 +1,15 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
ROI_HEADS:
NAME: CascadeROIHeads
ROI_BOX_HEAD:
CLS_AGNOSTIC_BBOX_REG: True
RPN:
POST_NMS_TOPK_TRAIN: 2000
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,36 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
MASK_ON: True
WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k"
RESNETS:
STRIDE_IN_1X1: False # this is a C2 model
NUM_GROUPS: 32
WIDTH_PER_GROUP: 8
DEPTH: 152
DEFORM_ON_PER_STAGE: [False, True, True, True]
ROI_HEADS:
NAME: "CascadeROIHeads"
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_CONV: 4
NUM_FC: 1
NORM: "GN"
CLS_AGNOSTIC_BBOX_REG: True
ROI_MASK_HEAD:
NUM_CONV: 8
NORM: "GN"
RPN:
POST_NMS_TOPK_TRAIN: 2000
SOLVER:
IMS_PER_BATCH: 128
STEPS: (35000, 45000)
MAX_ITER: 50000
BASE_LR: 0.16
INPUT:
MIN_SIZE_TRAIN: (640, 864)
MIN_SIZE_TRAIN_SAMPLING: "range"
MAX_SIZE_TRAIN: 1440
CROP:
ENABLED: True
TEST:
EVAL_PERIOD: 2500

View File

@ -0,0 +1,10 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
ROI_BOX_HEAD:
CLS_AGNOSTIC_BBOX_REG: True
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: True

View File

@ -0,0 +1,8 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
DEFORM_MODULATED: False

View File

@ -0,0 +1,11 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
DEFORM_MODULATED: False
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,21 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-50-GN"
MASK_ON: True
RESNETS:
DEPTH: 50
NORM: "GN"
STRIDE_IN_1X1: False
FPN:
NORM: "GN"
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_CONV: 4
NUM_FC: 1
NORM: "GN"
ROI_MASK_HEAD:
NORM: "GN"
SOLVER:
# 3x schedule
STEPS: (210000, 250000)
MAX_ITER: 270000

View File

@ -0,0 +1,24 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: True
RESNETS:
DEPTH: 50
NORM: "SyncBN"
STRIDE_IN_1X1: True
FPN:
NORM: "SyncBN"
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_CONV: 4
NUM_FC: 1
NORM: "SyncBN"
ROI_MASK_HEAD:
NORM: "SyncBN"
SOLVER:
# 3x schedule
STEPS: (210000, 250000)
MAX_ITER: 270000
TEST:
PRECISE_BN:
ENABLED: True

View File

@ -0,0 +1,151 @@
# An example config to train a mmdetection model using detectron2.
from ..common.data.coco import dataloader
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
from ..common.optim import SGD as optimizer
from ..common.train import train
from detectron2.modeling.mmdet_wrapper import MMDetDetector
from detectron2.config import LazyCall as L
model = L(MMDetDetector)(
detector=dict(
type="MaskRCNN",
pretrained="torchvision://resnet50",
backbone=dict(
type="ResNet",
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type="BN", requires_grad=True),
norm_eval=True,
style="pytorch",
),
neck=dict(type="FPN", in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5),
rpn_head=dict(
type="RPNHead",
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type="AnchorGenerator",
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64],
),
bbox_coder=dict(
type="DeltaXYWHBBoxCoder",
target_means=[0.0, 0.0, 0.0, 0.0],
target_stds=[1.0, 1.0, 1.0, 1.0],
),
loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type="L1Loss", loss_weight=1.0),
),
roi_head=dict(
type="StandardRoIHead",
bbox_roi_extractor=dict(
type="SingleRoIExtractor",
roi_layer=dict(type="RoIAlign", output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32],
),
bbox_head=dict(
type="Shared2FCBBoxHead",
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type="DeltaXYWHBBoxCoder",
target_means=[0.0, 0.0, 0.0, 0.0],
target_stds=[0.1, 0.1, 0.2, 0.2],
),
reg_class_agnostic=False,
loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type="L1Loss", loss_weight=1.0),
),
mask_roi_extractor=dict(
type="SingleRoIExtractor",
roi_layer=dict(type="RoIAlign", output_size=14, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32],
),
mask_head=dict(
type="FCNMaskHead",
num_convs=4,
in_channels=256,
conv_out_channels=256,
num_classes=80,
loss_mask=dict(type="CrossEntropyLoss", use_mask=True, loss_weight=1.0),
),
),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=dict(
type="MaxIoUAssigner",
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1,
),
sampler=dict(
type="RandomSampler",
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False,
),
allowed_border=-1,
pos_weight=-1,
debug=False,
),
rpn_proposal=dict(
nms_pre=2000,
max_per_img=1000,
nms=dict(type="nms", iou_threshold=0.7),
min_bbox_size=0,
),
rcnn=dict(
assigner=dict(
type="MaxIoUAssigner",
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=True,
ignore_iof_thr=-1,
),
sampler=dict(
type="RandomSampler",
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True,
),
mask_size=28,
pos_weight=-1,
debug=False,
),
),
test_cfg=dict(
rpn=dict(
nms_pre=1000,
max_per_img=1000,
nms=dict(type="nms", iou_threshold=0.7),
min_bbox_size=0,
),
rcnn=dict(
score_thr=0.05,
nms=dict(type="nms", iou_threshold=0.5),
max_per_img=100,
mask_thr_binary=0.5,
),
),
),
pixel_mean=[123.675, 116.280, 103.530],
pixel_std=[58.395, 57.120, 57.375],
)
dataloader.train.mapper.image_format = "RGB" # torchvision pretrained model
train.init_checkpoint = None # pretrained model is loaded inside backbone

View File

@ -0,0 +1,26 @@
# A large PanopticFPN for demo purposes.
# Use GN on backbone to support semantic seg.
# Use Cascade + Deform Conv to improve localization.
_BASE_: "../COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml"
MODEL:
WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-101-GN"
RESNETS:
DEPTH: 101
NORM: "GN"
DEFORM_ON_PER_STAGE: [False, True, True, True]
STRIDE_IN_1X1: False
FPN:
NORM: "GN"
ROI_HEADS:
NAME: CascadeROIHeads
ROI_BOX_HEAD:
CLS_AGNOSTIC_BBOX_REG: True
ROI_MASK_HEAD:
NORM: "GN"
RPN:
POST_NMS_TOPK_TRAIN: 2000
SOLVER:
STEPS: (105000, 125000)
MAX_ITER: 135000
IMS_PER_BATCH: 32
BASE_LR: 0.04

View File

@ -0,0 +1,13 @@
_BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml"
MODEL:
# Train from random initialization.
WEIGHTS: ""
# It makes sense to divide by STD when training from scratch
# But it seems to make no difference on the results and C2's models didn't do this.
# So we keep things consistent with C2.
# PIXEL_STD: [57.375, 57.12, 58.395]
MASK_ON: True
BACKBONE:
FREEZE_AT: 0
# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
# to learn what you need for training from scratch.

View File

@ -0,0 +1,19 @@
_BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml"
MODEL:
PIXEL_STD: [57.375, 57.12, 58.395]
WEIGHTS: ""
MASK_ON: True
RESNETS:
STRIDE_IN_1X1: False
BACKBONE:
FREEZE_AT: 0
SOLVER:
# 9x schedule
IMS_PER_BATCH: 64 # 4x the standard
STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k
MAX_ITER: 202500 # 90k * 9 / 4
BASE_LR: 0.08
TEST:
EVAL_PERIOD: 2500
# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
# to learn what you need for training from scratch.

View File

@ -0,0 +1,19 @@
_BASE_: "mask_rcnn_R_50_FPN_3x_syncbn.yaml"
MODEL:
PIXEL_STD: [57.375, 57.12, 58.395]
WEIGHTS: ""
MASK_ON: True
RESNETS:
STRIDE_IN_1X1: False
BACKBONE:
FREEZE_AT: 0
SOLVER:
# 9x schedule
IMS_PER_BATCH: 64 # 4x the standard
STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k
MAX_ITER: 202500 # 90k * 9 / 4
BASE_LR: 0.08
TEST:
EVAL_PERIOD: 2500
# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
# to learn what you need for training from scratch.

View File

@ -0,0 +1,11 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
META_ARCHITECTURE: "SemanticSegmentor"
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
DATASETS:
TRAIN: ("coco_2017_train_panoptic_stuffonly",)
TEST: ("coco_2017_val_panoptic_stuffonly",)
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)

View File

@ -0,0 +1,150 @@
"""
An example config file to train an ImageNet classifier with detectron2.
Model and dataloader both come from torchvision.
This shows how to use detectron2 as a general engine for any new models and tasks.
To run, use the following command:
python tools/lazyconfig_train_net.py --config-file configs/Misc/torchvision_imagenet_R_50.py \
--num-gpus 8 dataloader.train.dataset.root=/path/to/imagenet/
"""
import torch
from torch import nn
from torch.nn import functional as F
from omegaconf import OmegaConf
import torchvision
from torchvision.transforms import transforms as T
from torchvision.models.resnet import ResNet, Bottleneck
from fvcore.common.param_scheduler import MultiStepParamScheduler
from detectron2.solver import WarmupParamScheduler
from detectron2.solver.build import get_default_optimizer_params
from detectron2.config import LazyCall as L
from detectron2.model_zoo import get_config
from detectron2.data.samplers import TrainingSampler, InferenceSampler
from detectron2.evaluation import DatasetEvaluator
from detectron2.utils import comm
"""
Note: Here we put reusable code (models, evaluation, data) together with configs just as a
proof-of-concept, to easily demonstrate what's needed to train an ImageNet classifier in detectron2.
Writing code in configs offers extreme flexibility but is often not a good engineering practice.
In practice, you might want to put code in your project and import them instead.
"""
def build_data_loader(dataset, batch_size, num_workers, training=True):
return torch.utils.data.DataLoader(
dataset,
sampler=(TrainingSampler if training else InferenceSampler)(len(dataset)),
batch_size=batch_size,
num_workers=num_workers,
pin_memory=True,
)
class ClassificationNet(nn.Module):
def __init__(self, model: nn.Module):
super().__init__()
self.model = model
@property
def device(self):
return list(self.model.parameters())[0].device
def forward(self, inputs):
image, label = inputs
pred = self.model(image.to(self.device))
if self.training:
label = label.to(self.device)
return F.cross_entropy(pred, label)
else:
return pred
class ClassificationAcc(DatasetEvaluator):
def reset(self):
self.corr = self.total = 0
def process(self, inputs, outputs):
image, label = inputs
self.corr += (outputs.argmax(dim=1).cpu() == label.cpu()).sum().item()
self.total += len(label)
def evaluate(self):
all_corr_total = comm.all_gather([self.corr, self.total])
corr = sum(x[0] for x in all_corr_total)
total = sum(x[1] for x in all_corr_total)
return {"accuracy": corr / total}
# --- End of code that could be in a project and be imported
dataloader = OmegaConf.create()
dataloader.train = L(build_data_loader)(
dataset=L(torchvision.datasets.ImageNet)(
root="/path/to/imagenet",
split="train",
transform=L(T.Compose)(
transforms=[
L(T.RandomResizedCrop)(size=224),
L(T.RandomHorizontalFlip)(),
T.ToTensor(),
L(T.Normalize)(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
]
),
),
batch_size=256 // 8,
num_workers=4,
training=True,
)
dataloader.test = L(build_data_loader)(
dataset=L(torchvision.datasets.ImageNet)(
root="${...train.dataset.root}",
split="val",
transform=L(T.Compose)(
transforms=[
L(T.Resize)(size=256),
L(T.CenterCrop)(size=224),
T.ToTensor(),
L(T.Normalize)(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
]
),
),
batch_size=256 // 8,
num_workers=4,
training=False,
)
dataloader.evaluator = L(ClassificationAcc)()
model = L(ClassificationNet)(
model=(ResNet)(block=Bottleneck, layers=[3, 4, 6, 3], zero_init_residual=True)
)
optimizer = L(torch.optim.SGD)(
params=L(get_default_optimizer_params)(),
lr=0.1,
momentum=0.9,
weight_decay=1e-4,
)
lr_multiplier = L(WarmupParamScheduler)(
scheduler=L(MultiStepParamScheduler)(
values=[1.0, 0.1, 0.01, 0.001], milestones=[30, 60, 90, 100]
),
warmup_length=1 / 100,
warmup_factor=0.1,
)
train = get_config("common/train.py").train
train.init_checkpoint = None
train.max_iter = 100 * 1281167 // 256

View File

@ -0,0 +1,18 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50
ROI_HEADS:
NUM_CLASSES: 20
INPUT:
MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
MIN_SIZE_TEST: 800
DATASETS:
TRAIN: ('voc_2007_trainval', 'voc_2012_trainval')
TEST: ('voc_2007_test',)
SOLVER:
STEPS: (12000, 16000)
MAX_ITER: 18000 # 17.4 epochs
WARMUP_ITERS: 100

View File

@ -0,0 +1,18 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
RESNETS:
DEPTH: 50
ROI_HEADS:
NUM_CLASSES: 20
INPUT:
MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
MIN_SIZE_TEST: 800
DATASETS:
TRAIN: ('voc_2007_trainval', 'voc_2012_trainval')
TEST: ('voc_2007_test',)
SOLVER:
STEPS: (12000, 16000)
MAX_ITER: 18000 # 17.4 epochs
WARMUP_ITERS: 100

View File

@ -0,0 +1,6 @@
This directory provides definitions for a few common models, dataloaders, schedulers,
and optimizers that are often used in training.
The definitions of these objects are provided in the form of lazy instantiation:
their arguments can be edited by users before constructing the objects.
They can be imported, or loaded by `model_zoo.get_config` API in users' own configs.

View File

@ -0,0 +1,47 @@
from fvcore.common.param_scheduler import MultiStepParamScheduler
from detectron2.config import LazyCall as L
from detectron2.solver import WarmupParamScheduler
def default_X_scheduler(num_X):
"""
Returns the config for a default multi-step LR scheduler such as "1x", "3x",
commonly referred to in papers, where every 1x has the total length of 1440k
training images (~12 COCO epochs). LR is decayed twice at the end of training
following the strategy defined in "Rethinking ImageNet Pretraining", Sec 4.
Args:
num_X: a positive real number
Returns:
DictConfig: configs that define the multiplier for LR during training
"""
# total number of iterations assuming 16 batch size, using 1440000/16=90000
total_steps_16bs = num_X * 90000
if num_X <= 2:
scheduler = L(MultiStepParamScheduler)(
values=[1.0, 0.1, 0.01],
# note that scheduler is scale-invariant. This is equivalent to
# milestones=[6, 8, 9]
milestones=[60000, 80000, 90000],
)
else:
scheduler = L(MultiStepParamScheduler)(
values=[1.0, 0.1, 0.01],
milestones=[total_steps_16bs - 60000, total_steps_16bs - 20000, total_steps_16bs],
)
return L(WarmupParamScheduler)(
scheduler=scheduler,
warmup_length=1000 / total_steps_16bs,
warmup_method="linear",
warmup_factor=0.001,
)
lr_multiplier_1x = default_X_scheduler(1)
lr_multiplier_2x = default_X_scheduler(2)
lr_multiplier_3x = default_X_scheduler(3)
lr_multiplier_6x = default_X_scheduler(6)
lr_multiplier_9x = default_X_scheduler(9)

View File

@ -0,0 +1,48 @@
from omegaconf import OmegaConf
import detectron2.data.transforms as T
from detectron2.config import LazyCall as L
from detectron2.data import (
DatasetMapper,
build_detection_test_loader,
build_detection_train_loader,
get_detection_dataset_dicts,
)
from detectron2.evaluation import COCOEvaluator
dataloader = OmegaConf.create()
dataloader.train = L(build_detection_train_loader)(
dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"),
mapper=L(DatasetMapper)(
is_train=True,
augmentations=[
L(T.ResizeShortestEdge)(
short_edge_length=(640, 672, 704, 736, 768, 800),
sample_style="choice",
max_size=1333,
),
L(T.RandomFlip)(horizontal=True),
],
image_format="BGR",
use_instance_mask=True,
),
total_batch_size=16,
num_workers=4,
)
dataloader.test = L(build_detection_test_loader)(
dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False),
mapper=L(DatasetMapper)(
is_train=False,
augmentations=[
L(T.ResizeShortestEdge)(short_edge_length=800, max_size=1333),
],
image_format="${...train.mapper.image_format}",
),
num_workers=4,
)
dataloader.evaluator = L(COCOEvaluator)(
dataset_name="${..test.dataset.names}",
)

View File

@ -0,0 +1,13 @@
from detectron2.data.detection_utils import create_keypoint_hflip_indices
from .coco import dataloader
dataloader.train.dataset.min_keypoints = 1
dataloader.train.dataset.names = "keypoints_coco_2017_train"
dataloader.test.dataset.names = "keypoints_coco_2017_val"
dataloader.train.mapper.update(
use_instance_mask=False,
use_keypoint=True,
keypoint_hflip_indices=create_keypoint_hflip_indices(dataloader.train.dataset.names),
)

View File

@ -0,0 +1,26 @@
from detectron2.config import LazyCall as L
from detectron2.evaluation import (
COCOEvaluator,
COCOPanopticEvaluator,
DatasetEvaluators,
SemSegEvaluator,
)
from .coco import dataloader
dataloader.train.dataset.names = "coco_2017_train_panoptic_separated"
dataloader.train.dataset.filter_empty = False
dataloader.test.dataset.names = "coco_2017_val_panoptic_separated"
dataloader.evaluator = [
L(COCOEvaluator)(
dataset_name="${...test.dataset.names}",
),
L(SemSegEvaluator)(
dataset_name="${...test.dataset.names}",
),
L(COCOPanopticEvaluator)(
dataset_name="${...test.dataset.names}",
),
]

View File

@ -0,0 +1,36 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads
from .mask_rcnn_fpn import model
# arguments that don't exist for Cascade R-CNN
[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]]
model.roi_heads.update(
_target_=CascadeROIHeads,
box_heads=[
L(FastRCNNConvFCHead)(
input_shape=ShapeSpec(channels=256, height=7, width=7),
conv_dims=[],
fc_dims=[1024, 1024],
)
for k in range(3)
],
box_predictors=[
L(FastRCNNOutputLayers)(
input_shape=ShapeSpec(channels=1024),
test_score_thresh=0.05,
box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
cls_agnostic_bbox_reg=True,
num_classes="${...num_classes}",
)
for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
],
proposal_matchers=[
L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
for th in [0.5, 0.6, 0.7]
],
)

View File

@ -0,0 +1,23 @@
from detectron2.modeling.meta_arch.fcos import FCOS, FCOSHead
from .retinanet import model
model._target_ = FCOS
del model.anchor_generator
del model.box2box_transform
del model.anchor_matcher
del model.input_format
# Use P5 instead of C5 to compute P6/P7
# (Sec 2.2 of https://arxiv.org/abs/2006.09214)
model.backbone.top_block.in_feature = "p5"
model.backbone.top_block.in_channels = 256
# New score threshold determined based on sqrt(cls_score * centerness)
model.test_score_thresh = 0.2
model.test_nms_thresh = 0.6
model.head._target_ = FCOSHead
del model.head.num_anchors
model.head.norm = "GN"

View File

@ -0,0 +1,33 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.roi_heads import KRCNNConvDeconvUpsampleHead
from .mask_rcnn_fpn import model
[model.roi_heads.pop(x) for x in ["mask_in_features", "mask_pooler", "mask_head"]]
model.roi_heads.update(
num_classes=1,
keypoint_in_features=["p2", "p3", "p4", "p5"],
keypoint_pooler=L(ROIPooler)(
output_size=14,
scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
sampling_ratio=0,
pooler_type="ROIAlignV2",
),
keypoint_head=L(KRCNNConvDeconvUpsampleHead)(
input_shape=ShapeSpec(channels=256, width=14, height=14),
num_keypoints=17,
conv_dims=[512] * 8,
loss_normalizer="visible",
),
)
# Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2.
# 1000 proposals per-image is found to hurt box AP.
# Therefore we increase it to 1500 per-image.
model.proposal_generator.post_nms_topk = (1500, 1000)
# Keypoint AP degrades (though box AP improves) when using plain L1 loss
model.roi_heads.box_predictor.smooth_l1_beta = 0.5

View File

@ -0,0 +1,88 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.meta_arch import GeneralizedRCNN
from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
from detectron2.modeling.backbone import BasicStem, BottleneckBlock, ResNet
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.proposal_generator import RPN, StandardRPNHead
from detectron2.modeling.roi_heads import (
FastRCNNOutputLayers,
MaskRCNNConvUpsampleHead,
Res5ROIHeads,
)
model = L(GeneralizedRCNN)(
backbone=L(ResNet)(
stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
stages=L(ResNet.make_default_stages)(
depth=50,
stride_in_1x1=True,
norm="FrozenBN",
),
out_features=["res4"],
),
proposal_generator=L(RPN)(
in_features=["res4"],
head=L(StandardRPNHead)(in_channels=1024, num_anchors=15),
anchor_generator=L(DefaultAnchorGenerator)(
sizes=[[32, 64, 128, 256, 512]],
aspect_ratios=[0.5, 1.0, 2.0],
strides=[16],
offset=0.0,
),
anchor_matcher=L(Matcher)(
thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True
),
box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
batch_size_per_image=256,
positive_fraction=0.5,
pre_nms_topk=(12000, 6000),
post_nms_topk=(2000, 1000),
nms_thresh=0.7,
),
roi_heads=L(Res5ROIHeads)(
num_classes=80,
batch_size_per_image=512,
positive_fraction=0.25,
proposal_matcher=L(Matcher)(
thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False
),
in_features=["res4"],
pooler=L(ROIPooler)(
output_size=14,
scales=(1.0 / 16,),
sampling_ratio=0,
pooler_type="ROIAlignV2",
),
res5=L(ResNet.make_stage)(
block_class=BottleneckBlock,
num_blocks=3,
stride_per_block=[2, 1, 1],
in_channels=1024,
bottleneck_channels=512,
out_channels=2048,
norm="FrozenBN",
stride_in_1x1=True,
),
box_predictor=L(FastRCNNOutputLayers)(
input_shape=L(ShapeSpec)(channels="${...res5.out_channels}", height=1, width=1),
test_score_thresh=0.05,
box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)),
num_classes="${..num_classes}",
),
mask_head=L(MaskRCNNConvUpsampleHead)(
input_shape=L(ShapeSpec)(
channels="${...res5.out_channels}",
width="${...pooler.output_size}",
height="${...pooler.output_size}",
),
num_classes="${..num_classes}",
conv_dims=[256],
),
),
pixel_mean=[103.530, 116.280, 123.675],
pixel_std=[1.0, 1.0, 1.0],
input_format="BGR",
)

View File

@ -0,0 +1,93 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.meta_arch import GeneralizedRCNN
from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
from detectron2.modeling.backbone.fpn import LastLevelMaxPool
from detectron2.modeling.backbone import BasicStem, FPN, ResNet
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.proposal_generator import RPN, StandardRPNHead
from detectron2.modeling.roi_heads import (
StandardROIHeads,
FastRCNNOutputLayers,
MaskRCNNConvUpsampleHead,
FastRCNNConvFCHead,
)
model = L(GeneralizedRCNN)(
backbone=L(FPN)(
bottom_up=L(ResNet)(
stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
stages=L(ResNet.make_default_stages)(
depth=50,
stride_in_1x1=True,
norm="FrozenBN",
),
out_features=["res2", "res3", "res4", "res5"],
),
in_features="${.bottom_up.out_features}",
out_channels=256,
top_block=L(LastLevelMaxPool)(),
),
proposal_generator=L(RPN)(
in_features=["p2", "p3", "p4", "p5", "p6"],
head=L(StandardRPNHead)(in_channels=256, num_anchors=3),
anchor_generator=L(DefaultAnchorGenerator)(
sizes=[[32], [64], [128], [256], [512]],
aspect_ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64],
offset=0.0,
),
anchor_matcher=L(Matcher)(
thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True
),
box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
batch_size_per_image=256,
positive_fraction=0.5,
pre_nms_topk=(2000, 1000),
post_nms_topk=(1000, 1000),
nms_thresh=0.7,
),
roi_heads=L(StandardROIHeads)(
num_classes=80,
batch_size_per_image=512,
positive_fraction=0.25,
proposal_matcher=L(Matcher)(
thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False
),
box_in_features=["p2", "p3", "p4", "p5"],
box_pooler=L(ROIPooler)(
output_size=7,
scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
sampling_ratio=0,
pooler_type="ROIAlignV2",
),
box_head=L(FastRCNNConvFCHead)(
input_shape=ShapeSpec(channels=256, height=7, width=7),
conv_dims=[],
fc_dims=[1024, 1024],
),
box_predictor=L(FastRCNNOutputLayers)(
input_shape=ShapeSpec(channels=1024),
test_score_thresh=0.05,
box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)),
num_classes="${..num_classes}",
),
mask_in_features=["p2", "p3", "p4", "p5"],
mask_pooler=L(ROIPooler)(
output_size=14,
scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
sampling_ratio=0,
pooler_type="ROIAlignV2",
),
mask_head=L(MaskRCNNConvUpsampleHead)(
input_shape=ShapeSpec(channels=256, width=14, height=14),
num_classes="${..num_classes}",
conv_dims=[256, 256, 256, 256, 256],
),
),
pixel_mean=[103.530, 116.280, 123.675],
pixel_std=[1.0, 1.0, 1.0],
input_format="BGR",
)

View File

@ -0,0 +1,20 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling import PanopticFPN
from detectron2.modeling.meta_arch.semantic_seg import SemSegFPNHead
from .mask_rcnn_fpn import model
model._target_ = PanopticFPN
model.sem_seg_head = L(SemSegFPNHead)(
input_shape={
f: L(ShapeSpec)(stride=s, channels="${....backbone.out_channels}")
for f, s in zip(["p2", "p3", "p4", "p5"], [4, 8, 16, 32])
},
ignore_value=255,
num_classes=54, # COCO stuff + 1
conv_dims=128,
common_stride=4,
loss_weight=0.5,
norm="GN",
)

View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.meta_arch import RetinaNet
from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
from detectron2.modeling.backbone.fpn import LastLevelP6P7
from detectron2.modeling.backbone import BasicStem, FPN, ResNet
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.meta_arch.retinanet import RetinaNetHead
model = L(RetinaNet)(
backbone=L(FPN)(
bottom_up=L(ResNet)(
stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
stages=L(ResNet.make_default_stages)(
depth=50,
stride_in_1x1=True,
norm="FrozenBN",
),
out_features=["res3", "res4", "res5"],
),
in_features=["res3", "res4", "res5"],
out_channels=256,
top_block=L(LastLevelP6P7)(in_channels=2048, out_channels="${..out_channels}"),
),
head=L(RetinaNetHead)(
# Shape for each input feature map
input_shape=[ShapeSpec(channels=256)] * 5,
num_classes="${..num_classes}",
conv_dims=[256, 256, 256, 256],
prior_prob=0.01,
num_anchors=9,
),
anchor_generator=L(DefaultAnchorGenerator)(
sizes=[[x, x * 2 ** (1.0 / 3), x * 2 ** (2.0 / 3)] for x in [32, 64, 128, 256, 512]],
aspect_ratios=[0.5, 1.0, 2.0],
strides=[8, 16, 32, 64, 128],
offset=0.0,
),
box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
anchor_matcher=L(Matcher)(
thresholds=[0.4, 0.5], labels=[0, -1, 1], allow_low_quality_matches=True
),
num_classes=80,
head_in_features=["p3", "p4", "p5", "p6", "p7"],
focal_loss_alpha=0.25,
focal_loss_gamma=2.0,
pixel_mean=[103.530, 116.280, 123.675],
pixel_std=[1.0, 1.0, 1.0],
input_format="BGR",
)

View File

@ -0,0 +1,15 @@
import torch
from detectron2.config import LazyCall as L
from detectron2.solver.build import get_default_optimizer_params
SGD = L(torch.optim.SGD)(
params=L(get_default_optimizer_params)(
# params.model is meant to be set to the model object, before instantiating
# the optimizer.
weight_decay_norm=0.0
),
lr=0.02,
momentum=0.9,
weight_decay=1e-4,
)

Some files were not shown because too many files have changed in this diff.