mirror of https://github.com/FoundationVision/GLEE
update training code
parent
c16a4e7999
commit
7a021a077b
87
README.md
|
@ -1,10 +1,4 @@
|
|||
|
||||
|
||||
|
||||
<div align=center>
|
||||
<img src="assets/images/GLEE_logo.png" width=900 >
|
||||
</div>
|
||||
|
||||
# GLEE: General Object Foundation Model for Images and Videos at Scale
|
||||
|
||||
> #### Junfeng Wu\*, Yi Jiang\*, Qihao Liu, Zehuan Yuan, Xiang Bai<sup>†</sup>,and Song Bai<sup>†</sup>
|
||||
|
@ -13,37 +7,18 @@
|
|||
|
||||
\[[Project Page](https://glee-vision.github.io/)\] \[[Paper](https://arxiv.org/abs/2312.09158)\] \[[HuggingFace Demo](https://huggingface.co/spaces/Junfeng5/GLEE_demo)\] \[[Video Demo](https://youtu.be/PSVhfTPx0GQ)\]
|
||||
|
||||
[](https://paperswithcode.com/sota/long-tail-video-object-segmentation-on-burst-1?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/video-instance-segmentation-on-ovis-1?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/referring-video-object-segmentation-on-refer?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/referring-expression-segmentation-on-refer-1?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/multi-object-tracking-on-tao?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/open-world-instance-segmentation-on-uvo?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcocog?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/video-instance-segmentation-on-youtube-vis-1?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/object-detection-on-lvis-v1-0-val?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/instance-segmentation-on-lvis-v1-0-val?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/referring-expression-comprehension-on-refcoco?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco-3?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/instance-segmentation-on-coco-minival?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/referring-expression-comprehension-on?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/instance-segmentation-on-coco?p=general-object-foundation-model-for-images)
|
||||
[](https://paperswithcode.com/sota/referring-expression-comprehension-on-refcoco-1?p=general-object-foundation-model-for-images)
|
||||
|
||||
|
||||
[](http://www.youtube.com/watch?v=PSVhfTPx0GQ "Video Title")
|
||||
|
||||
|
||||

|
||||
[](https://paperswithcode.com/sota/long-tail-video-object-segmentation-on-burst-1?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/video-instance-segmentation-on-ovis-1?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/referring-video-object-segmentation-on-refer?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/referring-expression-segmentation-on-refer-1?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/multi-object-tracking-on-tao?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/open-world-instance-segmentation-on-uvo?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcocog?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/video-instance-segmentation-on-youtube-vis-1?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/object-detection-on-lvis-v1-0-val?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/instance-segmentation-on-lvis-v1-0-val?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/referring-expression-comprehension-on-refcoco?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco-3?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/instance-segmentation-on-coco-minival?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/referring-expression-comprehension-on?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/instance-segmentation-on-coco?p=general-object-foundation-model-for-images)[](https://paperswithcode.com/sota/referring-expression-comprehension-on-refcoco-1?p=general-object-foundation-model-for-images)
|
||||
|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
## Highlight:
|
||||
|
||||
- GLEE is accepted by **CVPR2024** !
|
||||
- GLEE is a general object foundation model jointly trained on over **ten million images** from various benchmarks with diverse levels of supervision.
|
||||
- GLEE is capable of addressing **a wide range of object-centric tasks** simultaneously while maintaining state-of-the-art performance.
|
||||
- GLEE is capable of addressing **a wide range of object-centric tasks** simultaneously while maintaining **SOTA** performance.
|
||||
- GLEE demonstrates remarkable versatility and robust **zero-shot transferability** across a spectrum of object-level image and video tasks, and is able to **serve as a foundational component** for enhancing other architectures or models.
|
||||
|
||||
|
||||
|
@ -51,10 +26,28 @@
|
|||
We will release the following contents for **GLEE**:exclamation:
|
||||
|
||||
- [x] Demo Code
|
||||
- [x] Model Checkpoint
|
||||
- [ ] Comprehensive User Guide
|
||||
- [ ] Training Code
|
||||
- [ ] Evaluation Code
|
||||
|
||||
- [x] Model Zoo
|
||||
|
||||
- [x] Comprehensive User Guide
|
||||
|
||||
- [x] Training Code and Scripts
|
||||
|
||||
- [ ] Detailed Evaluation Code and Scripts
|
||||
|
||||
- [ ] Tutorial for Zero-shot Testing or Fine-tuning GLEE on New Datasets
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Installation: Please refer to [INSTALL.md](assets/INSTALL.md) for more details.
|
||||
2. Data preparation: Please refer to [DATA.md](assets/DATA.md) for more details.
|
||||
3. Training: Please refer to [TRAIN.md](assets/TRAIN.md) for more details.
|
||||
4. Testing: Please refer to [TEST.md](assets/TEST.md) for more details.
|
||||
5. Model zoo: Please refer to [MODEL_ZOO.md](assets/MODEL_ZOO.md) for more details.
|
||||
|
||||
|
||||
|
||||
|
@ -64,13 +57,6 @@ Try our online demo app on \[[HuggingFace Demo](https://huggingface.co/spaces/Ju
|
|||
|
||||
```bash
|
||||
git clone https://github.com/FoundationVision/GLEE
|
||||
cd GLEE/app/
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Download the pretrained weights for [GLEE-Lite](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/GLEE_R50_Scaleup10m.pth?download=true) and [GLEE-Plus](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/GLEE_SwinL_Scaleup10m.pth?download=true), then run:
|
||||
|
||||
```bash
|
||||
# support CPU and GPU running
|
||||
python app.py
|
||||
```
|
||||
|
@ -79,6 +65,16 @@ python app.py
|
|||
|
||||
# Introduction
|
||||
|
||||
|
||||
|
||||
GLEE has been trained on over ten million images from 16 datasets, fully harnessing both existing annotated data and cost-effective automatically labeled data to construct a diverse training set. This extensive training regime endows GLEE with formidable generalization capabilities.
|
||||
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
|
||||
GLEE consists of an image encoder, a text encoder, a visual prompter, and an object decoder, as illustrated in the figure below. The text encoder processes arbitrary descriptions related to the task, including **1) object category lists, 2) object names in any form, 3) captions about objects, and 4) referring expressions**. The visual prompter encodes user inputs such as **1) points, 2) bounding boxes, and 3) scribbles** during interactive segmentation into corresponding visual representations of target objects. These are then integrated into a detector that extracts objects from images according to the textual and visual input.
|
||||
|
||||

|
||||
|
@ -117,3 +113,14 @@ Based on the above designs, GLEE can be used to seamlessly unify a wide range of
|
|||
}
|
||||
```
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
- Thanks [UNINEXT](https://github.com/MasterBin-IIAU/UNINEXT) for the implementation of multi-dataset training and data processing.
|
||||
|
||||
- Thanks [VNext](https://github.com/wjf5203/VNext) for providing experience of Video Instance Segmentation (VIS).
|
||||
|
||||
- Thanks [SEEM](https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once) for providing the implementation of the visual prompter.
|
||||
|
||||
- Thanks [MaskDINO](https://github.com/IDEA-Research/MaskDINO) for providing a powerful detector and segmenter.
|
||||
|
||||
|
||||
|
|
|
@ -12,13 +12,13 @@ import cv2
|
|||
import torch
|
||||
|
||||
from detectron2.config import get_cfg
|
||||
from GLEE.glee.models.glee_model import GLEE_Model
|
||||
from GLEE.glee.config_deeplab import add_deeplab_config
|
||||
from GLEE.glee.config import add_glee_config
|
||||
from projects.GLEE.glee.models.glee_model import GLEE_Model
|
||||
from projects.GLEE.glee.config import add_glee_config
|
||||
# from projects.GLEE import GLEE
|
||||
import torch.nn.functional as F
|
||||
import torchvision
|
||||
import math
|
||||
from obj365_name import categories as OBJ365_CATEGORIESV2
|
||||
from projects.GLEE.glee.data.datasets.objects365_v2 import categories as OBJ365_CATEGORIESV2
|
||||
|
||||
|
||||
print(f"Is CUDA available: {torch.cuda.is_available()}")
|
||||
|
@ -80,10 +80,9 @@ else:
|
|||
device='cpu'
|
||||
|
||||
cfg_r50 = get_cfg()
|
||||
add_deeplab_config(cfg_r50)
|
||||
add_glee_config(cfg_r50)
|
||||
conf_files_r50 = 'GLEE/configs/R50.yaml'
|
||||
checkpoints_r50 = torch.load('GLEE_R50_Scaleup10m.pth')
|
||||
conf_files_r50 = 'projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml'
|
||||
checkpoints_r50 = torch.load('GLEE_Lite_joint.pth')
|
||||
cfg_r50.merge_from_file(conf_files_r50)
|
||||
GLEEmodel_r50 = GLEE_Model(cfg_r50, None, device, None, True).to(device)
|
||||
GLEEmodel_r50.load_state_dict(checkpoints_r50, strict=False)
|
||||
|
@ -91,10 +90,9 @@ GLEEmodel_r50.eval()
|
|||
|
||||
|
||||
cfg_swin = get_cfg()
|
||||
add_deeplab_config(cfg_swin)
|
||||
add_glee_config(cfg_swin)
|
||||
conf_files_swin = 'GLEE/configs/SwinL.yaml'
|
||||
checkpoints_swin = torch.load('GLEE_SwinL_Scaleup10m.pth')
|
||||
conf_files_swin = 'projects/GLEE/configs/images/Plus/Stage2_joint_training_CLIPteacher_SwinL.yaml'
|
||||
checkpoints_swin = torch.load('GLEE_Plus_joint.pth')
|
||||
cfg_swin.merge_from_file(conf_files_swin)
|
||||
GLEEmodel_swin = GLEE_Model(cfg_swin, None, device, None, True).to(device)
|
||||
GLEEmodel_swin.load_state_dict(checkpoints_swin, strict=False)
|
||||
|
@ -157,7 +155,7 @@ def segment_image(img,prompt_mode, categoryname, custom_category, expressiong, r
|
|||
if categoryname =="COCO-80":
|
||||
batch_category_name = coco_class_name
|
||||
elif categoryname =="OBJ365":
|
||||
batch_category_name = obj365_class_name
|
||||
batch_category_name = OBJ365_class_names
|
||||
elif categoryname =="Custom-List":
|
||||
batch_category_name = custom_category.split(',')
|
||||
else:
|
|
@ -1,28 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Copyright (c) Facebook, Inc. and its affiliates.
|
||||
|
||||
|
||||
def add_deeplab_config(cfg):
|
||||
"""
|
||||
Add config for DeepLab.
|
||||
"""
|
||||
# We retry random cropping until no single category in semantic segmentation GT occupies more
|
||||
# than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
|
||||
cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
|
||||
# Used for `poly` learning rate schedule.
|
||||
cfg.SOLVER.POLY_LR_POWER = 0.9
|
||||
cfg.SOLVER.POLY_LR_CONSTANT_ENDING = 0.0
|
||||
# Loss type, choose from `cross_entropy`, `hard_pixel_mining`.
|
||||
cfg.MODEL.SEM_SEG_HEAD.LOSS_TYPE = "hard_pixel_mining"
|
||||
# DeepLab settings
|
||||
cfg.MODEL.SEM_SEG_HEAD.PROJECT_FEATURES = ["res2"]
|
||||
cfg.MODEL.SEM_SEG_HEAD.PROJECT_CHANNELS = [48]
|
||||
cfg.MODEL.SEM_SEG_HEAD.ASPP_CHANNELS = 256
|
||||
cfg.MODEL.SEM_SEG_HEAD.ASPP_DILATIONS = [6, 12, 18]
|
||||
cfg.MODEL.SEM_SEG_HEAD.ASPP_DROPOUT = 0.1
|
||||
cfg.MODEL.SEM_SEG_HEAD.USE_DEPTHWISE_SEPARABLE_CONV = False
|
||||
# Backbone new configs
|
||||
cfg.MODEL.RESNETS.RES4_DILATION = 1
|
||||
cfg.MODEL.RESNETS.RES5_MULTI_GRID = [1, 2, 4]
|
||||
# ResNet stem type from: `basic`, `deeplab`
|
||||
cfg.MODEL.RESNETS.STEM_TYPE = "deeplab"
|
|
@ -1,296 +0,0 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
"""
|
||||
"""
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
# from ..backbone import build_backbone, Backbone
|
||||
# from ..body.encoder import build_encoder
|
||||
# from ..body.decoder import build_decoder
|
||||
|
||||
from detectron2.modeling import build_backbone
|
||||
|
||||
from .pixel_decoder.maskdino_encoder import build_pixel_decoder
|
||||
from .transformer_decoder.maskdino_decoder import build_transformer_decoder
|
||||
|
||||
import random
|
||||
from transformers import AutoTokenizer
|
||||
from collections import OrderedDict
|
||||
from ..modules.point_features import point_sample
|
||||
from timm.models.layers import trunc_normal_
|
||||
from transformers import CLIPTokenizer,CLIPTextModel
|
||||
from .vos_utils import masks_to_boxes, FeatureFuser
|
||||
import numpy as np
|
||||
import math
|
||||
|
||||
|
||||
def rand_sample(x, max_len):
|
||||
if x.shape[1] <= max_len:
|
||||
return x
|
||||
else:
|
||||
rand_idx = torch.randperm(x.shape[1])[:max_len]
|
||||
return x[:,rand_idx]
|
||||
|
||||
|
||||
def agg_lang_feat(features, mask, pool_type="average"):
|
||||
"""average pooling of language features"""
|
||||
# feat: (bs, seq_len, C)
|
||||
# mask: (bs, seq_len)
|
||||
if pool_type == "average":
|
||||
embedded = features * mask.unsqueeze(-1).float() # use mask to zero out invalid token features
|
||||
aggregate = embedded.sum(1) / (mask.sum(-1).unsqueeze(-1).float())
|
||||
elif pool_type == "max":
|
||||
out = []
|
||||
for i in range(len(features)):
|
||||
pool_feat, _ = torch.max(features[i][mask[i]], 0) # (L, C) -> (C, )
|
||||
out.append(pool_feat)
|
||||
aggregate = torch.stack(out, dim=0) # (bs, C)
|
||||
else:
|
||||
raise ValueError("pool_type should be average or max")
|
||||
return aggregate
|
||||
|
||||
class GLEE_Model(nn.Module):
|
||||
"""
|
||||
Main class for mask classification semantic segmentation architectures.
|
||||
"""
|
||||
def __init__(self, cfg, matcher, device, video_info, contras_mean):
|
||||
super().__init__()
|
||||
self.cfg = cfg
|
||||
self.matcher = matcher
|
||||
self.backbone = build_backbone(cfg)
|
||||
output_channels = [v for k,v in self.backbone._out_feature_channels.items()]
|
||||
self.sot_fuser = FeatureFuser(output_channels[-3:], 256)
|
||||
|
||||
|
||||
self.tokenizer = CLIPTokenizer.from_pretrained('GLEE/clip_vit_base_patch32')
|
||||
self.tokenizer.add_special_tokens({'cls_token': self.tokenizer.eos_token})
|
||||
self.text_encoder = CLIPTextModel.from_pretrained('GLEE/clip_vit_base_patch32')
|
||||
# self.text_encoder_teacher = CLIPTextModel.from_pretrained('GLEE/clip_vit_base_patch32')
|
||||
self.lang_encoder = None
|
||||
# for p in self.text_encoder_teacher.parameters():
|
||||
# p.requires_grad = False
|
||||
self.lang_projection = nn.Parameter(torch.rand(cfg.MODEL.LANGUAGE_BACKBONE.LANG_DIM, cfg.MODEL.DIM_PROJ))
|
||||
self.text_encode_type = 'clip_teacher'
|
||||
|
||||
# self.lang_encoder = None
|
||||
self.pixel_decoder = build_pixel_decoder(cfg, self.backbone.output_shape())
|
||||
transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
|
||||
self.predictor = build_transformer_decoder(cfg, transformer_predictor_in_channels, lang_encoder = self.lang_encoder, mask_classification=True,)
|
||||
self.to(device)
|
||||
|
||||
self.video_info = video_info
|
||||
self.contras_mean = contras_mean
|
||||
|
||||
self.track_loss_version = cfg.MODEL.TRACK_VERSION
|
||||
|
||||
self.no_mask_tasks = ['obj365', 'obj365_clip','openimage', 'openimage_clip', 'vg', 'grit', 'bdd_det', 'bdd_track_box']
|
||||
|
||||
|
||||
# for visual prompt
|
||||
hidden_dim = 256
|
||||
self.max_spatial_len = [512,512,512,512]
|
||||
self.mask_sptial_embed = nn.ParameterList([nn.Parameter(torch.empty(hidden_dim, hidden_dim)) for x in range(4)])
|
||||
trunc_normal_(self.mask_sptial_embed[0], std=.02)
|
||||
trunc_normal_(self.mask_sptial_embed[1], std=.02)
|
||||
trunc_normal_(self.mask_sptial_embed[2], std=.02)
|
||||
trunc_normal_(self.mask_sptial_embed[3], std=.02)
|
||||
# learnable positive negative indicator
|
||||
self.pn_indicator = nn.Embedding(2, hidden_dim)
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return self.pixel_mean.device
|
||||
|
||||
def forward(self, images, prompts, task, targets=None, batch_name_list=None, is_train = True, visual_prompt_type='scribble'):
|
||||
extra = {}
|
||||
# dist_loss = None
|
||||
early_semantic = None
|
||||
|
||||
if self.text_encode_type == "clip_teacher":
|
||||
if task not in ['grounding','rvos']:
|
||||
assert batch_name_list
|
||||
calsses_name_list = batch_name_list
|
||||
tokenized = self.tokenizer.batch_encode_plus(calsses_name_list,
|
||||
max_length=self.cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN, # 256
|
||||
padding='max_length' if self.cfg.MODEL.LANGUAGE_BACKBONE.PAD_MAX else "longest", # max_length
|
||||
return_special_tokens_mask=True,
|
||||
return_tensors='pt',
|
||||
truncation=True).to(images.device)
|
||||
texts = (tokenized['input_ids'], tokenized['attention_mask'])
|
||||
token_x = self.text_encoder(*texts)['last_hidden_state']
|
||||
|
||||
valid_mask = tokenized['attention_mask'].bool()
|
||||
# token_x_teacher = self.text_encoder_teacher(*texts)['last_hidden_state']
|
||||
# if is_train:
|
||||
# dist_loss = F.mse_loss(token_x[valid_mask], token_x_teacher[valid_mask] )
|
||||
# F.l2_loss(token_x[valid_mask], token_x_teacher[valid_mask] )
|
||||
token_x = token_x @ self.lang_projection
|
||||
lang_feat_pool = agg_lang_feat(token_x, tokenized['attention_mask'], pool_type="average") # (bs, 768)
|
||||
extra['class_embeddings'] = lang_feat_pool
|
||||
if True: # early_fusion
|
||||
gather_all_classtoken = token_x.flatten(0,1)[tokenized['attention_mask'].flatten(0,1)>0]
|
||||
gather_all_classtoken = gather_all_classtoken.unsqueeze(0).repeat(len(images),1,1) #[bs,L,C]
|
||||
gather_all_classtoken_mask = torch.ones_like(gather_all_classtoken[:,:,0])>0 #[bs,L]
|
||||
early_semantic = {"hidden":gather_all_classtoken.float(),"masks":gather_all_classtoken_mask}
|
||||
|
||||
|
||||
if 'grounding' in prompts:
|
||||
|
||||
if self.text_encode_type == 'clip_frozen' or self.text_encode_type == 'clip_teacher':
|
||||
|
||||
tokens = self.tokenizer(
|
||||
prompts['grounding'], padding='max_length', truncation=True, max_length=self.cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN, return_tensors='pt'
|
||||
)
|
||||
tokens = {key: value.to(images.device) for key, value in tokens.items()}
|
||||
|
||||
texts = (tokens['input_ids'], tokens['attention_mask'])
|
||||
x = self.text_encoder(*texts)
|
||||
token_x = x['last_hidden_state']
|
||||
token_x = token_x @ self.lang_projection
|
||||
|
||||
extra['grounding_tokens'] = token_x.permute(1,0,2) #[len,bz,C]
|
||||
|
||||
non_zero_query_mask = tokens['attention_mask']
|
||||
lang_feat_pool = agg_lang_feat(token_x, non_zero_query_mask, pool_type="average").unsqueeze(1) # (bs, 1, 768)
|
||||
|
||||
dist_loss = (lang_feat_pool*0).sum()
|
||||
|
||||
extra['grounding_nonzero_mask'] = ~non_zero_query_mask.bool() # [bz,len]
|
||||
extra['grounding_class'] = lang_feat_pool.squeeze(1) #[bz,C]
|
||||
# gather_all_classtoken = token_x.flatten(0,1)[tokenized['attention_mask'].flatten(0,1)>0]
|
||||
# gather_all_classtoken = gather_all_classtoken.unsqueeze(0).repeat(len(images),1,1) #[bs,L,C]
|
||||
# gather_all_classtoken_mask = torch.ones_like(gather_all_classtoken[:,:,0])>0 #[bs,L]
|
||||
# early_semantic = {"hidden":gather_all_classtoken.float(),"masks":gather_all_classtoken_mask}
|
||||
early_semantic = {"hidden":token_x.float(),"masks":tokens['attention_mask']>0}
|
||||
|
||||
|
||||
if isinstance(images,torch.Tensor):
|
||||
features = self.backbone(images)
|
||||
else:
|
||||
features = self.backbone(images.tensor)
|
||||
|
||||
|
||||
|
||||
|
||||
if 'spatial' in prompts:
|
||||
## step 1, 2, 3
|
||||
key_images = [ images ] #bz*[1,3,H,W]
|
||||
key_promptmasks = [m.unsqueeze(0) for m in prompts['spatial']] #bz*[1,1,H,W]
|
||||
|
||||
prompt_mode = visual_prompt_type
|
||||
ref_feats, ref_masks = self.get_template(key_images, key_promptmasks, prompt_mode)
|
||||
early_fusion = {"hidden":ref_feats,"masks":ref_masks}
|
||||
if early_semantic is None:
|
||||
early_semantic = early_fusion
|
||||
else:
|
||||
early_semantic["hidden"] = torch.cat([early_semantic["hidden"],early_fusion["hidden"]],dim=1)
|
||||
early_semantic["masks"] = torch.cat([early_semantic["masks"],early_fusion["masks"]],dim=1)
|
||||
|
||||
|
||||
# bz = len(images)//2
|
||||
mask_features, _, multi_scale_features, zero_loss = self.pixel_decoder.forward_features(features, masks=None, early_fusion = early_semantic)
|
||||
if 'spatial' in prompts:
|
||||
pos_masks = prompts['spatial']
|
||||
# neg_masks = [~p for p in prompts['spatial']]
|
||||
neg_masks = [p&False for p in prompts['spatial']]
|
||||
|
||||
extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks})
|
||||
|
||||
|
||||
_,h,w = extra['spatial_query_pos_mask'][0].shape
|
||||
divisor = torch.tensor([h,w], device=mask_features.device)[None,]
|
||||
# Get mean pos spatial query
|
||||
non_zero_pos_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_pos_mask']]
|
||||
non_zero_pos_point = nn.utils.rnn.pad_sequence(non_zero_pos_point, padding_value=-1).permute(1,0,2)
|
||||
non_zero_pos_mask = (non_zero_pos_point.sum(dim=-1) < 0)
|
||||
spatial_query_pos = point_sample(mask_features, non_zero_pos_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True) #[(N, C, P)
|
||||
spatial_query_pos = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_pos.transpose(1,2), ~non_zero_pos_mask)]).transpose(0,1).nan_to_num() # [1,bz,C]
|
||||
# Get mean neg spatial query
|
||||
non_zero_neg_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_neg_mask']]
|
||||
non_zero_neg_point = nn.utils.rnn.pad_sequence(non_zero_neg_point, padding_value=-1).permute(1,0,2)
|
||||
non_zero_neg_mask = (non_zero_neg_point.sum(dim=-1) < 0)
|
||||
spatial_query_neg = point_sample(mask_features, non_zero_neg_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True)
|
||||
spatial_query_neg = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_neg.transpose(1,2), ~non_zero_neg_mask)]).transpose(0,1).nan_to_num()
|
||||
|
||||
# Get layerwise spatial query
|
||||
src_spatial_queries = []
|
||||
src_spatial_maskings = []
|
||||
for i in range(len(multi_scale_features)):
|
||||
bs,dc,h,w = multi_scale_features[i].shape
|
||||
# src_mask_features = multi_scale_features[i].view(h,w,bs,dc)
|
||||
src_mask_features = multi_scale_features[i].permute(2,3,0,1)
|
||||
src_mask_features = src_mask_features @ self.mask_sptial_embed[i]
|
||||
|
||||
non_zero_query_point_pos = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_pos_mask']]
|
||||
non_zero_query_point_neg = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_neg_mask']]
|
||||
non_zero_query_point = [torch.cat([x,y], dim=0) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
|
||||
pos_neg_indicator = [torch.cat([torch.ones(x.shape[0], device=x.device), -torch.ones(y.shape[0], device=y.device)]) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)]
|
||||
pos_neg_indicator = nn.utils.rnn.pad_sequence(pos_neg_indicator, padding_value=0)
|
||||
non_zero_query_point = nn.utils.rnn.pad_sequence(non_zero_query_point, padding_value=-1).permute(1,0,2)
|
||||
non_zero_query_mask = (non_zero_query_point.sum(dim=-1) < 0)
|
||||
non_zero_query_point[non_zero_query_mask] = 0
|
||||
|
||||
spatial_tokens = point_sample(src_mask_features.permute(2,3,0,1), non_zero_query_point.flip(dims=(2,)).type(src_mask_features.dtype), align_corners=True).permute(2,0,1)
|
||||
spatial_tokens[pos_neg_indicator==1] += self.pn_indicator.weight[0:1]
|
||||
spatial_tokens[pos_neg_indicator==-1] += self.pn_indicator.weight[1:2]
|
||||
|
||||
src_spatial_queries += [spatial_tokens]
|
||||
src_spatial_maskings += [non_zero_query_mask]
|
||||
|
||||
extra['visual_prompt_tokens'] = src_spatial_queries #[len,bz,C]
|
||||
extra['visual_prompt_nonzero_mask'] = src_spatial_maskings # [bz,len]
|
||||
|
||||
|
||||
outputs = self.predictor(multi_scale_features, mask_features, extra=extra, task=task, masks=None, targets=targets)
|
||||
return outputs
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def get_template(self, imgs, pad_masks, prompt_mode='scribble'):
|
||||
"""img: (N, 3, H, W), mask: (N, 1, H, W), bbox: (1, 4)"""
|
||||
"""get 4-channel template"""
|
||||
|
||||
croped_img_with_mask = []
|
||||
|
||||
for image_i, mask_i in zip( imgs, pad_masks):
|
||||
|
||||
if prompt_mode in ['scribble','point']:
|
||||
image_with_mask = image_i + mask_i.to(image_i)
|
||||
else:
|
||||
image_with_mask = image_i
|
||||
|
||||
# image_with_mask = torch.cat([image_i,mask_i.to(image_i)],dim=1) #[1,3,H,W]
|
||||
box_i = masks_to_boxes(mask_i[0]) #[xyxy]
|
||||
box_i[:, 2:] = box_i[:, 2:] - box_i[:, :2] #xywh
|
||||
|
||||
|
||||
x, y, w, h = box_i[0].long().tolist()
|
||||
|
||||
self.search_area_factor=2
|
||||
|
||||
crop_sz = math.ceil(math.sqrt(w * h) * self.search_area_factor)
|
||||
x1 = max(0,round(x + 0.5 * w - crop_sz * 0.5))
|
||||
x2 = x1 + crop_sz
|
||||
y1 = max(0,round(y + 0.5 * h - crop_sz * 0.5))
|
||||
y2 = y1 + crop_sz
|
||||
|
||||
im_crop = image_with_mask[:, :, y1:y2, x1:x2]
|
||||
# resize
|
||||
if im_crop.shape[-1] ==0 or im_crop.shape[-2] ==0 :
|
||||
im_crop = image_with_mask
|
||||
im_crop = F.interpolate(im_crop, (256,256), mode='bilinear', align_corners=False)
|
||||
croped_img_with_mask.append(im_crop)
|
||||
croped_img_with_mask = torch.cat(croped_img_with_mask,dim=0) #[bz,3,256,256]
|
||||
with torch.no_grad():
|
||||
ref_srcs = self.backbone(croped_img_with_mask.contiguous())
|
||||
ref_srcs = [v for k,v in ref_srcs.items()]
|
||||
ref_feats = self.sot_fuser(ref_srcs[1:]).float() #[bz,256,32,32]
|
||||
|
||||
ref_feats = ref_feats.flatten(-2).permute(0, 2, 1) # (bs, L, C)
|
||||
ref_masks = torch.ones_like(ref_feats[:,:,0])>0 #[bs,L]
|
||||
|
||||
return ref_feats, ref_masks
|
||||
|
|
@ -1,367 +0,0 @@
|
|||
categories = [
|
||||
{'id': 1, 'name': 'Person'},
|
||||
{'id': 2, 'name': 'Sneakers'},
|
||||
{'id': 3, 'name': 'Chair'},
|
||||
{'id': 4, 'name': 'Other Shoes'},
|
||||
{'id': 5, 'name': 'Hat'},
|
||||
{'id': 6, 'name': 'Car'},
|
||||
{'id': 7, 'name': 'Lamp'},
|
||||
{'id': 8, 'name': 'Glasses'},
|
||||
{'id': 9, 'name': 'Bottle'},
|
||||
{'id': 10, 'name': 'Desk'},
|
||||
{'id': 11, 'name': 'Cup'},
|
||||
{'id': 12, 'name': 'Street Lights'},
|
||||
{'id': 13, 'name': 'Cabinet/shelf'},
|
||||
{'id': 14, 'name': 'Handbag/Satchel'},
|
||||
{'id': 15, 'name': 'Bracelet'},
|
||||
{'id': 16, 'name': 'Plate'},
|
||||
{'id': 17, 'name': 'Picture/Frame'},
|
||||
{'id': 18, 'name': 'Helmet'},
|
||||
{'id': 19, 'name': 'Book'},
|
||||
{'id': 20, 'name': 'Gloves'},
|
||||
{'id': 21, 'name': 'Storage box'},
|
||||
{'id': 22, 'name': 'Boat'},
|
||||
{'id': 23, 'name': 'Leather Shoes'},
|
||||
{'id': 24, 'name': 'Flower'},
|
||||
{'id': 25, 'name': 'Bench'},
|
||||
{'id': 26, 'name': 'Potted Plant'},
|
||||
{'id': 27, 'name': 'Bowl/Basin'},
|
||||
{'id': 28, 'name': 'Flag'},
|
||||
{'id': 29, 'name': 'Pillow'},
|
||||
{'id': 30, 'name': 'Boots'},
|
||||
{'id': 31, 'name': 'Vase'},
|
||||
{'id': 32, 'name': 'Microphone'},
|
||||
{'id': 33, 'name': 'Necklace'},
|
||||
{'id': 34, 'name': 'Ring'},
|
||||
{'id': 35, 'name': 'SUV'},
|
||||
{'id': 36, 'name': 'Wine Glass'},
|
||||
{'id': 37, 'name': 'Belt'},
|
||||
{'id': 38, 'name': 'Moniter/TV'},
|
||||
{'id': 39, 'name': 'Backpack'},
|
||||
{'id': 40, 'name': 'Umbrella'},
|
||||
{'id': 41, 'name': 'Traffic Light'},
|
||||
{'id': 42, 'name': 'Speaker'},
|
||||
{'id': 43, 'name': 'Watch'},
|
||||
{'id': 44, 'name': 'Tie'},
|
||||
{'id': 45, 'name': 'Trash bin Can'},
|
||||
{'id': 46, 'name': 'Slippers'},
|
||||
{'id': 47, 'name': 'Bicycle'},
|
||||
{'id': 48, 'name': 'Stool'},
|
||||
{'id': 49, 'name': 'Barrel/bucket'},
|
||||
{'id': 50, 'name': 'Van'},
|
||||
{'id': 51, 'name': 'Couch'},
|
||||
{'id': 52, 'name': 'Sandals'},
|
||||
{'id': 53, 'name': 'Bakset'},
|
||||
{'id': 54, 'name': 'Drum'},
|
||||
{'id': 55, 'name': 'Pen/Pencil'},
|
||||
{'id': 56, 'name': 'Bus'},
|
||||
{'id': 57, 'name': 'Wild Bird'},
|
||||
{'id': 58, 'name': 'High Heels'},
|
||||
{'id': 59, 'name': 'Motorcycle'},
|
||||
{'id': 60, 'name': 'Guitar'},
|
||||
{'id': 61, 'name': 'Carpet'},
|
||||
{'id': 62, 'name': 'Cell Phone'},
|
||||
{'id': 63, 'name': 'Bread'},
|
||||
{'id': 64, 'name': 'Camera'},
|
||||
{'id': 65, 'name': 'Canned'},
|
||||
{'id': 66, 'name': 'Truck'},
|
||||
{'id': 67, 'name': 'Traffic cone'},
|
||||
{'id': 68, 'name': 'Cymbal'},
|
||||
{'id': 69, 'name': 'Lifesaver'},
|
||||
{'id': 70, 'name': 'Towel'},
|
||||
{'id': 71, 'name': 'Stuffed Toy'},
|
||||
{'id': 72, 'name': 'Candle'},
|
||||
{'id': 73, 'name': 'Sailboat'},
|
||||
{'id': 74, 'name': 'Laptop'},
|
||||
{'id': 75, 'name': 'Awning'},
|
||||
{'id': 76, 'name': 'Bed'},
|
||||
{'id': 77, 'name': 'Faucet'},
|
||||
{'id': 78, 'name': 'Tent'},
|
||||
{'id': 79, 'name': 'Horse'},
|
||||
{'id': 80, 'name': 'Mirror'},
|
||||
{'id': 81, 'name': 'Power outlet'},
|
||||
{'id': 82, 'name': 'Sink'},
|
||||
{'id': 83, 'name': 'Apple'},
|
||||
{'id': 84, 'name': 'Air Conditioner'},
|
||||
{'id': 85, 'name': 'Knife'},
|
||||
{'id': 86, 'name': 'Hockey Stick'},
|
||||
{'id': 87, 'name': 'Paddle'},
|
||||
{'id': 88, 'name': 'Pickup Truck'},
|
||||
{'id': 89, 'name': 'Fork'},
|
||||
{'id': 90, 'name': 'Traffic Sign'},
|
||||
{'id': 91, 'name': 'Ballon'},
|
||||
{'id': 92, 'name': 'Tripod'},
|
||||
{'id': 93, 'name': 'Dog'},
|
||||
{'id': 94, 'name': 'Spoon'},
|
||||
{'id': 95, 'name': 'Clock'},
|
||||
{'id': 96, 'name': 'Pot'},
|
||||
{'id': 97, 'name': 'Cow'},
|
||||
{'id': 98, 'name': 'Cake'},
|
||||
{'id': 99, 'name': 'Dinning Table'},
|
||||
{'id': 100, 'name': 'Sheep'},
|
||||
{'id': 101, 'name': 'Hanger'},
|
||||
{'id': 102, 'name': 'Blackboard/Whiteboard'},
|
||||
{'id': 103, 'name': 'Napkin'},
|
||||
{'id': 104, 'name': 'Other Fish'},
|
||||
{'id': 105, 'name': 'Orange/Tangerine'},
|
||||
{'id': 106, 'name': 'Toiletry'},
|
||||
{'id': 107, 'name': 'Keyboard'},
|
||||
{'id': 108, 'name': 'Tomato'},
|
||||
{'id': 109, 'name': 'Lantern'},
|
||||
{'id': 110, 'name': 'Machinery Vehicle'},
|
||||
{'id': 111, 'name': 'Fan'},
|
||||
{'id': 112, 'name': 'Green Vegetables'},
|
||||
{'id': 113, 'name': 'Banana'},
|
||||
{'id': 114, 'name': 'Baseball Glove'},
|
||||
{'id': 115, 'name': 'Airplane'},
|
||||
{'id': 116, 'name': 'Mouse'},
|
||||
{'id': 117, 'name': 'Train'},
|
||||
{'id': 118, 'name': 'Pumpkin'},
|
||||
{'id': 119, 'name': 'Soccer'},
|
||||
{'id': 120, 'name': 'Skiboard'},
|
||||
{'id': 121, 'name': 'Luggage'},
|
||||
{'id': 122, 'name': 'Nightstand'},
|
||||
{'id': 123, 'name': 'Tea pot'},
|
||||
{'id': 124, 'name': 'Telephone'},
|
||||
{'id': 125, 'name': 'Trolley'},
|
||||
{'id': 126, 'name': 'Head Phone'},
|
||||
{'id': 127, 'name': 'Sports Car'},
|
||||
{'id': 128, 'name': 'Stop Sign'},
|
||||
{'id': 129, 'name': 'Dessert'},
|
||||
{'id': 130, 'name': 'Scooter'},
|
||||
{'id': 131, 'name': 'Stroller'},
|
||||
{'id': 132, 'name': 'Crane'},
|
||||
{'id': 133, 'name': 'Remote'},
|
||||
{'id': 134, 'name': 'Refrigerator'},
|
||||
{'id': 135, 'name': 'Oven'},
|
||||
{'id': 136, 'name': 'Lemon'},
|
||||
{'id': 137, 'name': 'Duck'},
|
||||
{'id': 138, 'name': 'Baseball Bat'},
|
||||
{'id': 139, 'name': 'Surveillance Camera'},
|
||||
{'id': 140, 'name': 'Cat'},
|
||||
{'id': 141, 'name': 'Jug'},
|
||||
{'id': 142, 'name': 'Broccoli'},
|
||||
{'id': 143, 'name': 'Piano'},
|
||||
{'id': 144, 'name': 'Pizza'},
|
||||
{'id': 145, 'name': 'Elephant'},
|
||||
{'id': 146, 'name': 'Skateboard'},
|
||||
{'id': 147, 'name': 'Surfboard'},
|
||||
{'id': 148, 'name': 'Gun'},
|
||||
{'id': 149, 'name': 'Skating and Skiing shoes'},
|
||||
{'id': 150, 'name': 'Gas stove'},
|
||||
{'id': 151, 'name': 'Donut'},
|
||||
{'id': 152, 'name': 'Bow Tie'},
|
||||
{'id': 153, 'name': 'Carrot'},
|
||||
{'id': 154, 'name': 'Toilet'},
|
||||
{'id': 155, 'name': 'Kite'},
|
||||
{'id': 156, 'name': 'Strawberry'},
|
||||
{'id': 157, 'name': 'Other Balls'},
|
||||
{'id': 158, 'name': 'Shovel'},
|
||||
{'id': 159, 'name': 'Pepper'},
|
||||
{'id': 160, 'name': 'Computer Box'},
|
||||
{'id': 161, 'name': 'Toilet Paper'},
|
||||
{'id': 162, 'name': 'Cleaning Products'},
|
||||
{'id': 163, 'name': 'Chopsticks'},
|
||||
{'id': 164, 'name': 'Microwave'},
|
||||
{'id': 165, 'name': 'Pigeon'},
|
||||
{'id': 166, 'name': 'Baseball'},
|
||||
{'id': 167, 'name': 'Cutting/chopping Board'},
|
||||
{'id': 168, 'name': 'Coffee Table'},
|
||||
{'id': 169, 'name': 'Side Table'},
|
||||
{'id': 170, 'name': 'Scissors'},
|
||||
{'id': 171, 'name': 'Marker'},
|
||||
{'id': 172, 'name': 'Pie'},
|
||||
{'id': 173, 'name': 'Ladder'},
|
||||
{'id': 174, 'name': 'Snowboard'},
|
||||
{'id': 175, 'name': 'Cookies'},
|
||||
{'id': 176, 'name': 'Radiator'},
|
||||
{'id': 177, 'name': 'Fire Hydrant'},
|
||||
{'id': 178, 'name': 'Basketball'},
|
||||
{'id': 179, 'name': 'Zebra'},
|
||||
{'id': 180, 'name': 'Grape'},
|
||||
{'id': 181, 'name': 'Giraffe'},
|
||||
{'id': 182, 'name': 'Potato'},
|
||||
{'id': 183, 'name': 'Sausage'},
|
||||
{'id': 184, 'name': 'Tricycle'},
|
||||
{'id': 185, 'name': 'Violin'},
|
||||
{'id': 186, 'name': 'Egg'},
|
||||
{'id': 187, 'name': 'Fire Extinguisher'},
|
||||
{'id': 188, 'name': 'Candy'},
|
||||
{'id': 189, 'name': 'Fire Truck'},
|
||||
{'id': 190, 'name': 'Billards'},
|
||||
{'id': 191, 'name': 'Converter'},
|
||||
{'id': 192, 'name': 'Bathtub'},
|
||||
{'id': 193, 'name': 'Wheelchair'},
|
||||
{'id': 194, 'name': 'Golf Club'},
|
||||
{'id': 195, 'name': 'Briefcase'},
|
||||
{'id': 196, 'name': 'Cucumber'},
|
||||
{'id': 197, 'name': 'Cigar/Cigarette '},
|
||||
{'id': 198, 'name': 'Paint Brush'},
|
||||
{'id': 199, 'name': 'Pear'},
|
||||
{'id': 200, 'name': 'Heavy Truck'},
|
||||
{'id': 201, 'name': 'Hamburger'},
|
||||
{'id': 202, 'name': 'Extractor'},
|
||||
{'id': 203, 'name': 'Extention Cord'},
|
||||
{'id': 204, 'name': 'Tong'},
|
||||
{'id': 205, 'name': 'Tennis Racket'},
|
||||
{'id': 206, 'name': 'Folder'},
|
||||
{'id': 207, 'name': 'American Football'},
|
||||
{'id': 208, 'name': 'earphone'},
|
||||
{'id': 209, 'name': 'Mask'},
|
||||
{'id': 210, 'name': 'Kettle'},
|
||||
{'id': 211, 'name': 'Tennis'},
|
||||
{'id': 212, 'name': 'Ship'},
|
||||
{'id': 213, 'name': 'Swing'},
|
||||
{'id': 214, 'name': 'Coffee Machine'},
|
||||
{'id': 215, 'name': 'Slide'},
|
||||
{'id': 216, 'name': 'Carriage'},
|
||||
{'id': 217, 'name': 'Onion'},
|
||||
{'id': 218, 'name': 'Green beans'},
|
||||
{'id': 219, 'name': 'Projector'},
|
||||
{'id': 220, 'name': 'Frisbee'},
|
||||
{'id': 221, 'name': 'Washing Machine/Drying Machine'},
|
||||
{'id': 222, 'name': 'Chicken'},
|
||||
{'id': 223, 'name': 'Printer'},
|
||||
{'id': 224, 'name': 'Watermelon'},
|
||||
{'id': 225, 'name': 'Saxophone'},
|
||||
{'id': 226, 'name': 'Tissue'},
|
||||
{'id': 227, 'name': 'Toothbrush'},
|
||||
{'id': 228, 'name': 'Ice cream'},
|
||||
{'id': 229, 'name': 'Hotair ballon'},
|
||||
{'id': 230, 'name': 'Cello'},
|
||||
{'id': 231, 'name': 'French Fries'},
|
||||
{'id': 232, 'name': 'Scale'},
|
||||
{'id': 233, 'name': 'Trophy'},
|
||||
{'id': 234, 'name': 'Cabbage'},
|
||||
{'id': 235, 'name': 'Hot dog'},
|
||||
{'id': 236, 'name': 'Blender'},
|
||||
{'id': 237, 'name': 'Peach'},
|
||||
{'id': 238, 'name': 'Rice'},
|
||||
{'id': 239, 'name': 'Wallet/Purse'},
|
||||
{'id': 240, 'name': 'Volleyball'},
|
||||
{'id': 241, 'name': 'Deer'},
|
||||
{'id': 242, 'name': 'Goose'},
|
||||
{'id': 243, 'name': 'Tape'},
|
||||
{'id': 244, 'name': 'Tablet'},
|
||||
{'id': 245, 'name': 'Cosmetics'},
|
||||
{'id': 246, 'name': 'Trumpet'},
|
||||
{'id': 247, 'name': 'Pineapple'},
|
||||
{'id': 248, 'name': 'Golf Ball'},
|
||||
{'id': 249, 'name': 'Ambulance'},
|
||||
{'id': 250, 'name': 'Parking meter'},
|
||||
{'id': 251, 'name': 'Mango'},
|
||||
{'id': 252, 'name': 'Key'},
|
||||
{'id': 253, 'name': 'Hurdle'},
|
||||
{'id': 254, 'name': 'Fishing Rod'},
|
||||
{'id': 255, 'name': 'Medal'},
|
||||
{'id': 256, 'name': 'Flute'},
|
||||
{'id': 257, 'name': 'Brush'},
|
||||
{'id': 258, 'name': 'Penguin'},
|
||||
{'id': 259, 'name': 'Megaphone'},
|
||||
{'id': 260, 'name': 'Corn'},
|
||||
{'id': 261, 'name': 'Lettuce'},
|
||||
{'id': 262, 'name': 'Garlic'},
|
||||
{'id': 263, 'name': 'Swan'},
|
||||
{'id': 264, 'name': 'Helicopter'},
|
||||
{'id': 265, 'name': 'Green Onion'},
|
||||
{'id': 266, 'name': 'Sandwich'},
|
||||
{'id': 267, 'name': 'Nuts'},
|
||||
{'id': 268, 'name': 'Speed Limit Sign'},
|
||||
{'id': 269, 'name': 'Induction Cooker'},
|
||||
{'id': 270, 'name': 'Broom'},
|
||||
{'id': 271, 'name': 'Trombone'},
|
||||
{'id': 272, 'name': 'Plum'},
|
||||
{'id': 273, 'name': 'Rickshaw'},
|
||||
{'id': 274, 'name': 'Goldfish'},
|
||||
{'id': 275, 'name': 'Kiwi fruit'},
|
||||
{'id': 276, 'name': 'Router/modem'},
|
||||
{'id': 277, 'name': 'Poker Card'},
|
||||
{'id': 278, 'name': 'Toaster'},
|
||||
{'id': 279, 'name': 'Shrimp'},
|
||||
{'id': 280, 'name': 'Sushi'},
|
||||
{'id': 281, 'name': 'Cheese'},
|
||||
{'id': 282, 'name': 'Notepaper'},
|
||||
{'id': 283, 'name': 'Cherry'},
|
||||
{'id': 284, 'name': 'Pliers'},
|
||||
{'id': 285, 'name': 'CD'},
|
||||
{'id': 286, 'name': 'Pasta'},
|
||||
{'id': 287, 'name': 'Hammer'},
|
||||
{'id': 288, 'name': 'Cue'},
|
||||
{'id': 289, 'name': 'Avocado'},
|
||||
{'id': 290, 'name': 'Hamimelon'},
|
||||
{'id': 291, 'name': 'Flask'},
|
||||
{'id': 292, 'name': 'Mushroon'},
|
||||
{'id': 293, 'name': 'Screwdriver'},
|
||||
{'id': 294, 'name': 'Soap'},
|
||||
{'id': 295, 'name': 'Recorder'},
|
||||
{'id': 296, 'name': 'Bear'},
|
||||
{'id': 297, 'name': 'Eggplant'},
|
||||
{'id': 298, 'name': 'Board Eraser'},
|
||||
{'id': 299, 'name': 'Coconut'},
|
||||
{'id': 300, 'name': 'Tape Measur/ Ruler'},
|
||||
{'id': 301, 'name': 'Pig'},
|
||||
{'id': 302, 'name': 'Showerhead'},
|
||||
{'id': 303, 'name': 'Globe'},
|
||||
{'id': 304, 'name': 'Chips'},
|
||||
{'id': 305, 'name': 'Steak'},
|
||||
{'id': 306, 'name': 'Crosswalk Sign'},
|
||||
{'id': 307, 'name': 'Stapler'},
|
||||
{'id': 308, 'name': 'Campel'},
|
||||
{'id': 309, 'name': 'Formula 1 '},
|
||||
{'id': 310, 'name': 'Pomegranate'},
|
||||
{'id': 311, 'name': 'Dishwasher'},
|
||||
{'id': 312, 'name': 'Crab'},
|
||||
{'id': 313, 'name': 'Hoverboard'},
|
||||
{'id': 314, 'name': 'Meat ball'},
|
||||
{'id': 315, 'name': 'Rice Cooker'},
|
||||
{'id': 316, 'name': 'Tuba'},
|
||||
{'id': 317, 'name': 'Calculator'},
|
||||
{'id': 318, 'name': 'Papaya'},
|
||||
{'id': 319, 'name': 'Antelope'},
|
||||
{'id': 320, 'name': 'Parrot'},
|
||||
{'id': 321, 'name': 'Seal'},
|
||||
{'id': 322, 'name': 'Buttefly'},
|
||||
{'id': 323, 'name': 'Dumbbell'},
|
||||
{'id': 324, 'name': 'Donkey'},
|
||||
{'id': 325, 'name': 'Lion'},
|
||||
{'id': 326, 'name': 'Urinal'},
|
||||
{'id': 327, 'name': 'Dolphin'},
|
||||
{'id': 328, 'name': 'Electric Drill'},
|
||||
{'id': 329, 'name': 'Hair Dryer'},
|
||||
{'id': 330, 'name': 'Egg tart'},
|
||||
{'id': 331, 'name': 'Jellyfish'},
|
||||
{'id': 332, 'name': 'Treadmill'},
|
||||
{'id': 333, 'name': 'Lighter'},
|
||||
{'id': 334, 'name': 'Grapefruit'},
|
||||
{'id': 335, 'name': 'Game board'},
|
||||
{'id': 336, 'name': 'Mop'},
|
||||
{'id': 337, 'name': 'Radish'},
|
||||
{'id': 338, 'name': 'Baozi'},
|
||||
{'id': 339, 'name': 'Target'},
|
||||
{'id': 340, 'name': 'French'},
|
||||
{'id': 341, 'name': 'Spring Rolls'},
|
||||
{'id': 342, 'name': 'Monkey'},
|
||||
{'id': 343, 'name': 'Rabbit'},
|
||||
{'id': 344, 'name': 'Pencil Case'},
|
||||
{'id': 345, 'name': 'Yak'},
|
||||
{'id': 346, 'name': 'Red Cabbage'},
|
||||
{'id': 347, 'name': 'Binoculars'},
|
||||
{'id': 348, 'name': 'Asparagus'},
|
||||
{'id': 349, 'name': 'Barbell'},
|
||||
{'id': 350, 'name': 'Scallop'},
|
||||
{'id': 351, 'name': 'Noddles'},
|
||||
{'id': 352, 'name': 'Comb'},
|
||||
{'id': 353, 'name': 'Dumpling'},
|
||||
{'id': 354, 'name': 'Oyster'},
|
||||
{'id': 355, 'name': 'Table Teniis paddle'},
|
||||
{'id': 356, 'name': 'Cosmetics Brush/Eyeliner Pencil'},
|
||||
{'id': 357, 'name': 'Chainsaw'},
|
||||
{'id': 358, 'name': 'Eraser'},
|
||||
{'id': 359, 'name': 'Lobster'},
|
||||
{'id': 360, 'name': 'Durian'},
|
||||
{'id': 361, 'name': 'Okra'},
|
||||
{'id': 362, 'name': 'Lipstick'},
|
||||
{'id': 363, 'name': 'Cosmetics Mirror'},
|
||||
{'id': 364, 'name': 'Curling'},
|
||||
{'id': 365, 'name': 'Table Tennis '},
|
||||
]
|
|
@ -1,8 +0,0 @@
|
|||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.1.0
|
||||
torchvision
|
||||
scipy
|
||||
opencv-python
|
||||
timm
|
||||
transformers
|
||||
einops
|
|
@ -0,0 +1,274 @@
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Data Preparation
|
||||
|
||||
**This document details how to prepare all the datasets used in the training and testing stages of GLEE.**
|
||||
|
||||
GLEE uses the following 16 datasets for joint training and performs zero-shot evaluation on 6 additional datasets. For Objects365, the RefCOCO series, YouTubeVOS, Ref-YouTubeVOS, and BDD, we follow UNINEXT for preprocessing; please refer to UNINEXT for details. Users who only want to test or fine-tune on a subset of the datasets do not need to download all of them.
|
||||
|
||||
## For Training
|
||||
|
||||
|
||||
|
||||
### COCO
|
||||
|
||||
Please download [COCO](https://cocodataset.org/#home) from the official website. We use [train2017.zip](http://images.cocodataset.org/zips/train2017.zip), [train2014.zip](http://images.cocodataset.org/zips/train2014.zip), [val2017.zip](http://images.cocodataset.org/zips/val2017.zip), [test2017.zip](http://images.cocodataset.org/zips/test2017.zip) & [annotations_trainval2017.zip](http://images.cocodataset.org/annotations/annotations_trainval2017.zip), [image_info_test2017.zip](http://images.cocodataset.org/annotations/image_info_test2017.zip). We expect that the data is organized as below.
|
||||
|
||||
```
|
||||
${GLEE_ROOT}
|
||||
-- datasets
|
||||
-- coco
|
||||
-- annotations
|
||||
-- train2017
|
||||
-- train2014
|
||||
-- val2017
|
||||
-- test2017
|
||||
```
|
||||
|
||||
### LVIS
|
||||
|
||||
Please download [LVISv1](https://www.lvisdataset.org/dataset) from the official website. LVIS uses the COCO 2017 train, validation, and test image sets, so only the annotations need to be downloaded: [lvis_v1_train.json.zip](https://dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip), [lvis_v1_val.json.zip](https://dl.fbaipublicfiles.com/LVIS/lvis_v1_val.json.zip), [lvis_v1_minival_inserted_image_name.json](https://huggingface.co/GLIPModel/GLIP/resolve/main/lvis_v1_minival_inserted_image_name.json). We expect that the data is organized as below.
|
||||
|
||||
```
|
||||
${GLEE_ROOT}
|
||||
-- datasets
|
||||
-- lvis
|
||||
-- lvis_v1_train.json
|
||||
-- lvis_v1_val.json
|
||||
-- lvis_v1_minival_inserted_image_name.json
|
||||
```
|
||||
|
||||
### VisualGenome
|
||||
|
||||
Please download [VisualGenome](https://homes.cs.washington.edu/~ranjay/visualgenome/api.html) images from the official website: [part 1 (9.2 GB)](https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip), [part 2 (5.47 GB)](https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip), and download our preprocessed annotation files: [train.json](), [train_from_objects.json](). We expect that the data is organized as below.
|
||||
|
||||
```
|
||||
${GLEE_ROOT}
|
||||
-- datasets
|
||||
-- visual_genome
|
||||
-- images
|
||||
-- *.jpg
|
||||
...
|
||||
-- annotations
|
||||
-- train_from_objects.json
|
||||
-- train.json
|
||||
```
|
||||
|
||||
|
||||
|
||||
### OpenImages
|
||||
|
||||
Please download [OpenImages v6](https://storage.googleapis.com/openimages/web/download_v6.html) images from the official website; all detection annotations need to be preprocessed into COCO format. We expect that the data is organized as below.
|
||||
|
||||
```
|
||||
${GLEE_ROOT}
|
||||
-- datasets
|
||||
-- openimages
|
||||
-- detection
|
||||
-- openimages_v6_train_bbox.json
|
||||
```
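As a reference for what this preprocessing involves, here is a hedged sketch that turns the standard OpenImages V6 box CSV (normalized `XMin`/`XMax`/`YMin`/`YMax` columns) into COCO-style JSON. The `image_sizes` and `label_map` lookups are assumptions about how you resolve image dimensions and class IDs; the released `openimages_v6_train_bbox.json` may differ in details.

```python
import csv, json

def openimages_boxes_to_coco(csv_path, image_sizes, label_map, out_path):
    """Convert OpenImages V6 box annotations (normalized corners) to COCO format.

    image_sizes: dict ImageID -> (width, height); label_map: dict LabelName (MID) -> contiguous id.
    Both are assumptions about how sizes/classes are looked up; the official file may differ.
    """
    images, annotations, seen = [], [], set()
    with open(csv_path) as f:
        for ann_id, row in enumerate(csv.DictReader(f), start=1):
            img_id, label = row["ImageID"], row["LabelName"]
            if img_id not in image_sizes or label not in label_map:
                continue
            w, h = image_sizes[img_id]
            if img_id not in seen:  # COCO ids are kept as the OpenImages string IDs here for simplicity
                seen.add(img_id)
                images.append({"id": img_id, "file_name": f"{img_id}.jpg", "width": w, "height": h})
            # OpenImages stores corners normalized to [0, 1]; COCO wants absolute [x, y, w, h]
            x0, x1 = float(row["XMin"]) * w, float(row["XMax"]) * w
            y0, y1 = float(row["YMin"]) * h, float(row["YMax"]) * h
            annotations.append({"id": ann_id, "image_id": img_id, "category_id": label_map[label],
                                "bbox": [x0, y0, x1 - x0, y1 - y0], "area": (x1 - x0) * (y1 - y0),
                                "iscrowd": 0})
    categories = [{"id": i, "name": mid} for mid, i in label_map.items()]
    with open(out_path, "w") as f:
        json.dump({"images": images, "annotations": annotations, "categories": categories}, f)
```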
|
||||
|
||||
### VIS
|
||||
|
||||
Download the YouTube-VIS [2019](https://codalab.lisn.upsaclay.fr/competitions/6064#participate-get_data), [2021](https://codalab.lisn.upsaclay.fr/competitions/7680#participate-get_data), and [OVIS](https://codalab.lisn.upsaclay.fr/competitions/4763#participate) datasets for the video instance segmentation task. Their video annotations must be converted into COCO format in advance for image-level joint training by running ```python3 conversion/conver_vis2coco.py```. We expect that the data is organized as below.
|
||||
|
||||
```
|
||||
${GLEE_ROOT}
|
||||
-- datasets
|
||||
-- ytvis_2019
|
||||
-- train
|
||||
-- val
|
||||
-- annotations
|
||||
-- instances_train_sub.json
|
||||
-- instances_val_sub.json
|
||||
-- ytvis19_cocofmt.json
|
||||
-- ytvis_2021
|
||||
-- train
|
||||
-- val
|
||||
-- annotations
|
||||
-- instances_train_sub.json
|
||||
-- instances_val_sub.json
|
||||
-- ytvis21_cocofmt.json
|
||||
-- ovis
|
||||
-- train
|
||||
-- val
|
||||
-- annotations_train.json
|
||||
-- annotations_valid.json
|
||||
-- ovis_cocofmt.json
|
||||
```
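For reference, the conversion performed by `conversion/conver_vis2coco.py` essentially flattens video annotations into per-frame COCO records. The sketch below assumes YouTube-VIS/OVIS-style JSON (per-video `file_names` plus per-frame `segmentations`/`bboxes` in each annotation); it illustrates the idea and is not the actual script.

```python
import json

def vis_to_coco(vis_json_path, out_path):
    """Flatten a YouTube-VIS/OVIS style annotation file into image-level COCO format.
    Each video frame becomes one COCO image; each visible per-frame mask/box becomes one annotation."""
    vis = json.load(open(vis_json_path))
    images, annotations = [], []
    frame_id = {}  # (video_id, frame_index) -> new image id
    for video in vis["videos"]:
        for idx, file_name in enumerate(video["file_names"]):
            img_id = len(images) + 1
            frame_id[(video["id"], idx)] = img_id
            images.append({"id": img_id, "file_name": file_name,
                           "height": video["height"], "width": video["width"]})
    for ann in vis["annotations"]:
        for idx, (seg, box) in enumerate(zip(ann["segmentations"], ann["bboxes"])):
            if seg is None and box is None:
                continue  # the object is not visible in this frame
            annotations.append({"id": len(annotations) + 1,
                                "image_id": frame_id[(ann["video_id"], idx)],
                                "category_id": ann["category_id"],
                                "segmentation": seg, "bbox": box,
                                "area": ann.get("areas", [None] * len(ann["bboxes"]))[idx],
                                "iscrowd": ann.get("iscrowd", 0)})
    json.dump({"images": images, "annotations": annotations,
               "categories": vis["categories"]}, open(out_path, "w"))
```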
|
||||
|
||||
|
||||
|
||||
### SA1B
|
||||
|
||||
We download data from the [SA1B](https://ai.meta.com/datasets/segment-anything-downloads/) official website and only use [sa_000000.tar ~ sa_000050.tar], which we preprocess into the required format for training the model. First, perform NMS on each sa_n directory to keep the larger, object-level masks by running:
|
||||
|
||||
```python
|
||||
python3 convert_sam2coco_rewritresa1b.py --src sa_000000
|
||||
python3 convert_sam2coco_rewritresa1b.py --src sa_000001
|
||||
python3 convert_sam2coco_rewritresa1b.py --src sa_000002
|
||||
python3 convert_sam2coco_rewritresa1b.py --src sa_000003
|
||||
...
|
||||
python3 convert_sam2coco_rewritresa1b.py --src sa_000050
|
||||
```
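Conceptually, this per-directory NMS keeps the large, object-level masks and drops smaller masks that are mostly contained in an already-kept one. The sketch below illustrates that idea on boxes used as a cheap proxy for masks; the actual `convert_sam2coco_rewritresa1b.py` script may use different criteria and thresholds.

```python
import torch

def keep_larger_objects(boxes, contain_thresh=0.9):
    """boxes: (N, 4) xyxy tensor for the masks of one SA-1B image (boxes as a mask proxy).
    Greedily keep masks from largest to smallest and drop any smaller mask that is
    mostly contained in an already-kept one, approximating 'object-level masks only'."""
    areas = (boxes[:, 2] - boxes[:, 0]).clamp(min=0) * (boxes[:, 3] - boxes[:, 1]).clamp(min=0)
    order = areas.argsort(descending=True)
    keep = []
    for i in order.tolist():
        contained = False
        for j in keep:
            # intersection of candidate box i with an already-kept (larger) box j
            x0 = torch.maximum(boxes[i, 0], boxes[j, 0]); y0 = torch.maximum(boxes[i, 1], boxes[j, 1])
            x1 = torch.minimum(boxes[i, 2], boxes[j, 2]); y1 = torch.minimum(boxes[i, 3], boxes[j, 3])
            inter = (x1 - x0).clamp(min=0) * (y1 - y0).clamp(min=0)
            if inter / areas[i].clamp(min=1e-6) > contain_thresh:
                contained = True  # i is mostly covered by a larger kept mask -> drop it
                break
        if not contained:
            keep.append(i)
    return keep

boxes = torch.tensor([[0., 0., 100., 100.],       # whole object: kept
                      [10., 10., 40., 40.],       # part inside it: dropped
                      [200., 200., 260., 260.]])  # separate object: kept
print(keep_larger_objects(boxes))                 # [0, 2]
```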
|
||||
|
||||
Then merge all the annotations by running:
|
||||
|
||||
``` python
|
||||
python3 merge_sa1b.py
|
||||
```
|
||||
|
||||
|
||||
|
||||
We expect that the data is organized as below.
|
||||
|
||||
```
|
||||
${GLEE_ROOT}
|
||||
-- datasets
|
||||
-- SA1B
|
||||
-- images
|
||||
-- sa_000000
|
||||
-- sa_1.jpg
|
||||
-- sa_1.json
|
||||
-- ...
|
||||
-- sa_000001
|
||||
-- ...
|
||||
-- sa1b_subtrain_500k.json
|
||||
-- sa1b_subtrain_1m.json
|
||||
-- sa1b_subtrain_2m.json
|
||||
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
### UVO
|
||||
|
||||
Please download [UVO](https://sites.google.com/view/unidentified-video-object/dataset) from the official website, and download our preprocessed annotation file [annotations]():
|
||||
|
||||
We expect that the data is organized as below.
|
||||
|
||||
```
|
||||
${GLEE_ROOT}
|
||||
-- datasets
|
||||
-- UVO
|
||||
-- uvo_videos_dense_frames_jpg
|
||||
-- uvo_videos_sparse_frames_jpg
|
||||
-- uvo_videos_frames
|
||||
-- annotations
|
||||
-- FrameSet
|
||||
-- UVO_frame_train_onecate.json
|
||||
-- UVO_frame_val_onecate.json
|
||||
-- VideoDenseSet
|
||||
-- UVO_video_train_dense_objectlabel.json
|
||||
-- UVO_video_val_dense_objectlabel.json
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
### Objects365 and others
|
||||
|
||||
Following UNINEXT, we prepare **Objects365, RefCOCO series, YouTubeVOS, Ref-YouTubeVOS, and BDD** data, and we expect that they are organized as below:
|
||||
|
||||
```
|
||||
${GLEE_ROOT}
|
||||
-- datasets
|
||||
-- Objects365v2
|
||||
-- annotations
|
||||
-- zhiyuan_objv2_train_new.json
|
||||
-- zhiyuan_objv2_val_new.json
|
||||
-- images
|
||||
-- annotations
|
||||
-- refcoco-unc
|
||||
-- refcocog-umd
|
||||
-- refcocoplus-unc
|
||||
-- ytbvos18
|
||||
-- train
|
||||
-- val
|
||||
-- ref-youtube-vos
|
||||
-- meta_expressions
|
||||
-- train
|
||||
-- valid
|
||||
-- train.json
|
||||
-- valid.json
|
||||
-- RVOS_refcocofmt.json
|
||||
-- bdd
|
||||
-- images
|
||||
-- 10k
|
||||
-- 100k
|
||||
-- seg_track_20
|
||||
-- track
|
||||
-- labels
|
||||
-- box_track_20
|
||||
-- det_20
|
||||
-- ins_seg
|
||||
-- seg_track_20
|
||||
|
||||
|
||||
```
|
||||
|
||||
RVOS_refcocofmt.json converts the Ref-YouTube-VOS annotations into the RefCOCO format and is used for image-level training. It can be generated by running ```python3 conversion/ref-ytbvos-conversion.py```.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## For Evaluation Only
|
||||
|
||||
The following datasets are only used for zero-shot evaluation, and are not used in joint-training.
|
||||
|
||||
### OmniLabel
|
||||
|
||||
Please download [OmniLabel](https://www.omnilabel.org/dataset/download) from the official website, and download our converted annotations in COCO format: [omnilabel](). We expect that the data is organized as below.
|
||||
|
||||
```
|
||||
${GLEE_ROOT}
|
||||
-- datasets
|
||||
-- omnilabel
|
||||
-- images
|
||||
-- coco
|
||||
-- object365
|
||||
-- openimagesv5
|
||||
-- omnilabel_coco.json
|
||||
-- omnilabel_obj365.json
|
||||
-- omnilabel_openimages.json
|
||||
-- omnilabel_cocofmt.json
|
||||
```
|
||||
|
||||
### ODinW
|
||||
|
||||
We follow [GLIP](https://github.com/microsoft/GLIP) to prepare the ODinW 35 dataset; run ```python3 download.py``` to download it.
|
||||
|
||||
We expect that the data is organized as below.
|
||||
|
||||
```
|
||||
${GLEE_ROOT}
|
||||
-- datasets
|
||||
-- odinw
|
||||
-- dataset
|
||||
-- AerialMaritimeDrone
|
||||
-- CottontailRabbits
|
||||
-- NorthAmericaMushrooms
|
||||
-- ...
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Updating...
|
||||
|
||||
### TAO
|
||||
|
||||
### BURST
|
||||
|
||||
### LV-VIS
|
||||
|
||||
### MOSE
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
# Install
|
||||
## Requirements
|
||||
We test the code in the following environment; other versions may also be compatible, but the PyTorch version should be >= 1.7.
|
||||
|
||||
- CUDA 12.1
|
||||
- Python 3.9.2
|
||||
- PyTorch 2.1.0
|
||||
- Torchvision 0.16.0
|
||||
|
||||
## Install environment for GLEE
|
||||
|
||||
```
|
||||
pip3 install shapely==1.7.1
|
||||
pip3 install lvis
|
||||
pip3 install scipy
|
||||
pip3 install fairscale
|
||||
pip3 install einops
|
||||
pip3 install xformers
|
||||
pip3 install tensorboard
|
||||
pip3 install opencv-python-headless
|
||||
pip3 install timm
|
||||
pip3 install ftfy
|
||||
pip3 install transformers==4.36.0
|
||||
|
||||
pip3 install -e .
|
||||
pip3 install git+https://github.com/wjf5203/cocoapi.git#"egg=pycocotools&subdirectory=PythonAPI" --user
|
||||
|
||||
|
||||
|
||||
# compile Deformable DETR
|
||||
cd projects/GLEE/glee/models/pixel_decoder/ops/
|
||||
python3 setup.py build install --user
|
||||
|
||||
```
|
|
@ -0,0 +1,41 @@
|
|||
# GLEE MODEL ZOO
|
||||
|
||||
## Introduction
|
||||
GLEE maintains state-of-the-art (SOTA) performance across multiple tasks while preserving versatility and openness, demonstrating strong generalization capabilities. Here, we provide the model weights for all three stages of GLEE: '-pretrain', '-joint', and '-scaleup'. The '-pretrain' weights refer to those pretrained on Objects365 and OpenImages, yielding an effective initialization from over three million detection images. The '-joint' weights are derived from joint training on 15 datasets, where the model achieves optimal performance. The '-scaleup' weights are obtained by incorporating additional automatically annotated SA1B and GRIT data, which enhance zero-shot performance and support a richer semantic understanding. Additionally, we offer weights fine-tuned on VOS data for interactive video tracking applications.
|
||||
|
||||
### Stage 1: Pretraining
|
||||
|
||||
| Name | Config | Weight |
|
||||
| :----------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
|
||||
| GLEE-Lite-pretrain | Stage1_pretrain_openimage_obj365_CLIPfrozen_R50.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Lite_pretrain.pth) |
|
||||
| GLEE-Plus-pretrain | Stage1_pretrain_openimage_obj365_CLIPfrozen_SwinL.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Plus_pretrain.pth) |
|
||||
| GLEE-Pro-pretrain | Stage1_pretrain_openimage_obj365_CLIPfrozen_EVA02L_LSJ1536.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Pro_pretrain.pth) |
|
||||
|
||||
|
||||
|
||||
### Stage 2: Image-level Joint Training
|
||||
|
||||
| Name | Config | Weight |
|
||||
| :-------------: | :-------------------------------------------: | :----------------------------------------------------------: |
|
||||
| GLEE-Lite-joint | Stage2_joint_training_CLIPteacher_R50.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Lite_joint.pth) |
|
||||
| GLEE-Plus-joint | Stage2_joint_training_CLIPteacher_SwinL.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Plus_joint.pth) |
|
||||
| GLEE-Pro-joint | Stage2_joint_training_CLIPteacher_EVA02L.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Pro_joint.pth) |
|
||||
|
||||
### Stage 3: Scale-up Training
|
||||
|
||||
| Name | Config | Weight |
|
||||
| :---------------: | :------------------------------------: | :----------------------------------------------------------: |
|
||||
| GLEE-Lite-scaleup | Stage3_scaleup_CLIPteacher_R50.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Lite_scaleup.pth) |
|
||||
| GLEE-Plus-scaleup | Stage3_scaleup_CLIPteacher_SwinL.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Plus_scaleup.pth) |
|
||||
| GLEE-Pro-scaleup | Stage3_scaleup_CLIPteacher_EVA02L.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Pro_scaleup.pth) |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
### Single Tasks
|
||||
We also provide models trained on a VOS task with ResNet-50 backbone:
|
||||
|
||||
| Name | Config | Weight |
|
||||
| :-----------: | :-------------------------: | :----------------------------------------------------------: |
|
||||
| GLEE-Lite-vos | VOS_joint_finetune_R50.yaml | [Model](https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Lite_vos.pth) |
|
|
@ -0,0 +1,59 @@
|
|||
# Tutorial for Testing (Continuously Updated)
|
||||
|
||||
GLEE can be directly tested on classic detection and segmentation datasets locally. For some video datasets, the results need to be submitted to Codalab for evaluation. Additionally, certain datasets such as TAO, BURST, and OmniLabel require evaluation using additional tools. We will continue to update the evaluation tutorials for all datasets reported in the paper here.
|
||||
|
||||
|
||||
|
||||
## Detection, Instance Segmentation, REC & RES
|
||||
|
||||
GLEE can directly perform evaluations on COCO, Objects365, LVIS, and the RefCOCO series based on Detectron2. Typically, the Stage 2 YAML config file can be used; manually adjust the dataset to be evaluated and select the corresponding weights to download from [MODEL_ZOO.md](MODEL_ZOO.md).
|
||||
|
||||
To run inference on COCO:
|
||||
|
||||
```bash
|
||||
# Lite
|
||||
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml --num-gpus 8 --eval-only MODEL.WEIGHTS path/to/GLEE_Lite_joint.pth DATASETS.TEST '("coco_2017_val",)'
|
||||
|
||||
# Plus
|
||||
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Plus/Stage2_joint_training_CLIPteacher_SwinL.yaml --num-gpus 8 --eval-only MODEL.WEIGHTS path/to/GLEE_Plus_joint.pth DATASETS.TEST '("coco_2017_val",)'
|
||||
|
||||
# Pro
|
||||
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Pro/Stage2_joint_training_CLIPteacher_EVA02L.yaml --num-gpus 8 --eval-only MODEL.WEIGHTS path/to/GLEE_Pro_joint.pth DATASETS.TEST '("coco_2017_val",)'
|
||||
|
||||
```

Replace the `path/to/...` placeholders with the actual path to the downloaded pretrained weights, and use `DATASETS.TEST` to specify the dataset you wish to evaluate on.

`'("coco_2017_val",)'` can be replaced by:

```bash
# Lite
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml --num-gpus 8 --eval-only MODEL.WEIGHTS path/to/GLEE_Lite_joint.pth DATASETS.TEST
'("coco_2017_val",)'
'("lvis_v1_minival",)'
'("lvis_v1_val",)'
'("objects365_v2_val",)'
'("refcoco-unc-val",)'
'("refcoco-unc-testA",)'
'("refcoco-unc-testB",)'
'("refcocoplus-unc-val",)'
'("refcocoplus-unc-testA",)'
'("refcocoplus-unc-testB",)'
'("refcocog-umd-val",)'
'("refcocog-umd-test",)'
# Alternatively, to infer across all tasks at once:
'("coco_2017_val","lvis_v1_minival","lvis_v1_val","objects365_v2_val","refcoco-unc-val","refcoco-unc-testA","refcoco-unc-testB","refcocoplus-unc-val","refcocoplus-unc-testA","refcocoplus-unc-testB","refcocog-umd-val","refcocog-umd-test",)'
```
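
The same pattern extends to the other checkpoints in [MODEL_ZOO.md](MODEL_ZOO.md); for instance, a sketch that pairs the scale-up GLEE-Lite weights with their Stage 3 config (assuming the Stage 3 config can be used for evaluation in the same way as the Stage 2 one above):

```bash
# Sketch: evaluate the scale-up GLEE-Lite checkpoint on LVIS minival.
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage3_scaleup_CLIPteacher_R50.yaml --num-gpus 8 --eval-only MODEL.WEIGHTS path/to/GLEE_Lite_scaleup.pth DATASETS.TEST '("lvis_v1_minival",)'
```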

# Video Tasks (Continuously Updated)

# Omnilabel and ODinW (Continuously Updated)

@@ -0,0 +1,93 @@

# Tutorial for Training

GLEE has three training stages: (1) pretraining on Objects365 and OpenImages; (2) image-level joint training across 15 datasets; (3) scale-up training that integrates additional SA1B and GRIT data. The corresponding yaml config files start with `Stage1`, `Stage2`, and `Stage3`, respectively.

By default, we train GLEE on 64 A100 GPUs with a batch size of 128. For fine-tuning on video tasks or on novel downstream image tasks (ODinW), we default to eight A100 GPUs. Users interested in specific datasets, or aiming to further improve performance by training on individual datasets, can adjust the `DATASETS` entry in the yaml config file, as sketched below.
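
As a sketch of such an adjustment, the dataset lists can also be overridden from the command line instead of editing the yaml. This assumes the GLEE configs expose standard `DATASETS.TRAIN` / `DATASETS.TEST` keys, as the evaluation commands in the testing tutorial do:

```bash
# Hedged sketch: restrict GLEE-Lite Stage 2 training to COCO only by overriding
# the dataset lists on the command line (assumes standard DATASETS.TRAIN/TEST keys).
python3 projects/GLEE/train_net.py \
    --config-file projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml \
    --num-gpus 8 \
    DATASETS.TRAIN '("coco_2017_train",)' DATASETS.TEST '("coco_2017_val",)'
```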

We provide configurations for Stage 1, 2, and 3 training with three backbones (ResNet-50, Swin-Large, and EVA02-Large) for the Lite, Plus, and Pro variants under the [projects/GLEE/configs](../projects/GLEE/configs) folder. When employing larger or novel backbones, it is advisable to initialize the components other than the backbone from the pretrained GLEE-Lite-joint weights to speed up convergence.
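
For example, a minimal sketch of such an initialization, assuming `MODEL.WEIGHTS` accepts a GLEE checkpoint and the checkpointer skips backbone parameters whose shapes do not match the new backbone (`<your_new_backbone_config.yaml>` is a placeholder):

```bash
# Sketch: warm-start everything except the (mismatched) backbone from GLEE-Lite-joint.
wget -P weights/ https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/GLEE_Lite_joint.pth
python3 projects/GLEE/train_net.py \
    --config-file projects/GLEE/configs/images/<your_new_backbone_config.yaml> \
    --num-gpus 8 MODEL.WEIGHTS weights/GLEE_Lite_joint.pth
```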

## Pretrained Backbone Weights

```bash
# Language model (CLIP text encoder)
wget -P projects/GLEE/clip_vit_base_patch32/ https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/GLEE/clip_vit_base_patch32/pytorch_model.bin

# R50 (GLEE-Lite) warm-up initialization weights.
# A randomly initialized Transformer decoder converges slowly when combined with the
# large vocabulary of Objects365 and OpenImages, so it is recommended to initialize
# from the MaskDINO Transformer weights (which provide region-proposal capability)
# to accelerate convergence.
cd weights/
wget https://huggingface.co/spaces/Junfeng5/GLEE_demo/resolve/main/MODEL_ZOO/converted_maskdino_r50_withoutclip.pth

# Swin-Large backbone weights
wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth
# EVA02-Large backbone weights
wget https://huggingface.co/Yuxin-CV/EVA-02/resolve/main/eva02/pt/eva02_L_pt_m38m_p14to16.pt

# Convert the EVA02 weights
python3 convert_eva02.py
```

Other pretrained GLEE models can be found in [MODEL_ZOO.md](MODEL_ZOO.md).

## Joint Training

To train from scratch, follow Stages 1, 2, and 3 in sequence, executing the training scripts in order; each stage builds upon the weights from the previous one (a staged sketch is given after the config listing below).

For training on a single machine, you can execute the following command:

```bash
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/<config_stageX.yaml> --num-gpus 8
```

Replace `<config_stageX.yaml>` with the actual configuration file for each stage:

```
${GLEE_ROOT}
    -- projects
        -- GLEE
            -- configs
                -- images
                    -- Lite
                        -- Stage1_pretrain_openimage_obj365_CLIPfrozen_R50.yaml
                        -- Stage2_joint_training_CLIPteacher_R50.yaml
                        -- Stage3_scaleup_CLIPteacher_R50.yaml
                    -- Plus
                        -- Stage1_pretrain_openimage_obj365_CLIPfrozen_SwinL.yaml
                        -- Stage2_joint_training_CLIPteacher_SwinL.yaml
                        -- Stage3_scaleup_CLIPteacher_SwinL.yaml
                    -- Pro
                        -- Stage1_pretrain_openimage_obj365_CLIPfrozen_EVA02L_LSJ1536.yaml
                        -- Stage2_joint_training_CLIPteacher_EVA02L.yaml
                        -- Stage3_scaleup_CLIPteacher_EVA02L.yaml
```
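
As referenced above, here is a sketch of the full Stage 1 -> 2 -> 3 sequence for GLEE-Lite on a single machine. The `output/stageN/model_final.pth` paths are placeholders that depend on `OUTPUT_DIR` in your configs, and chaining the stages through `MODEL.WEIGHTS` is an assumption; the Stage 2/3 configs may already point to the previous checkpoints.

```bash
# Sketch: run the three GLEE-Lite stages in order (single machine, 8 GPUs).
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage1_pretrain_openimage_obj365_CLIPfrozen_R50.yaml --num-gpus 8
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml --num-gpus 8 MODEL.WEIGHTS output/stage1/model_final.pth
python3 projects/GLEE/train_net.py --config-file projects/GLEE/configs/images/Lite/Stage3_scaleup_CLIPteacher_R50.yaml --num-gpus 8 MODEL.WEIGHTS output/stage2/model_final.pth
```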

Our standard setup involves training on multiple machines (64 x A100), for which you can use the distributed training script:

```bash
python3 launch.py --nn <num_machines> --port <PORT> --worker_rank <Global_Rank> --master_address <MASTER_ADDRESS> --config-file projects/GLEE/configs/images/<config_stageX.yaml>
```

Here, `<num_machines>` is the number of machines you intend to use, `<Global_Rank>` is the global rank of the current node, `<MASTER_ADDRESS>` is the IP address of node 0, `<PORT>` must be the same across all nodes, and `<config_stageX.yaml>` is the configuration file for the specific stage of training.
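
For concreteness, a hypothetical two-node run of the Lite Stage 2 config; the port and master address below are placeholders, and the second node runs the same command with `--worker_rank 1`:

```bash
# Node 0 (master). On node 1, repeat the command with --worker_rank 1.
python3 launch.py --nn 2 --port 29500 --worker_rank 0 --master_address 10.0.0.1 \
    --config-file projects/GLEE/configs/images/Lite/Stage2_joint_training_CLIPteacher_R50.yaml
```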

# Finetune (Continuously Updated)

We also provide fine-tuning scripts that enable fine-tuning GLEE on downstream tasks such as ODinW and various video tasks to achieve better performance.

These will be made available as soon as possible.

@ -0,0 +1,18 @@
|
|||
MODEL:
|
||||
META_ARCHITECTURE: "GeneralizedRCNN"
|
||||
RPN:
|
||||
PRE_NMS_TOPK_TEST: 6000
|
||||
POST_NMS_TOPK_TEST: 1000
|
||||
ROI_HEADS:
|
||||
NAME: "Res5ROIHeads"
|
||||
DATASETS:
|
||||
TRAIN: ("coco_2017_train",)
|
||||
TEST: ("coco_2017_val",)
|
||||
SOLVER:
|
||||
IMS_PER_BATCH: 16
|
||||
BASE_LR: 0.02
|
||||
STEPS: (60000, 80000)
|
||||
MAX_ITER: 90000
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
||||
VERSION: 2
|
|
@ -0,0 +1,31 @@
|
|||
MODEL:
|
||||
META_ARCHITECTURE: "GeneralizedRCNN"
|
||||
RESNETS:
|
||||
OUT_FEATURES: ["res5"]
|
||||
RES5_DILATION: 2
|
||||
RPN:
|
||||
IN_FEATURES: ["res5"]
|
||||
PRE_NMS_TOPK_TEST: 6000
|
||||
POST_NMS_TOPK_TEST: 1000
|
||||
ROI_HEADS:
|
||||
NAME: "StandardROIHeads"
|
||||
IN_FEATURES: ["res5"]
|
||||
ROI_BOX_HEAD:
|
||||
NAME: "FastRCNNConvFCHead"
|
||||
NUM_FC: 2
|
||||
POOLER_RESOLUTION: 7
|
||||
ROI_MASK_HEAD:
|
||||
NAME: "MaskRCNNConvUpsampleHead"
|
||||
NUM_CONV: 4
|
||||
POOLER_RESOLUTION: 14
|
||||
DATASETS:
|
||||
TRAIN: ("coco_2017_train",)
|
||||
TEST: ("coco_2017_val",)
|
||||
SOLVER:
|
||||
IMS_PER_BATCH: 16
|
||||
BASE_LR: 0.02
|
||||
STEPS: (60000, 80000)
|
||||
MAX_ITER: 90000
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
||||
VERSION: 2
|
|
@ -0,0 +1,42 @@
|
|||
MODEL:
|
||||
META_ARCHITECTURE: "GeneralizedRCNN"
|
||||
BACKBONE:
|
||||
NAME: "build_resnet_fpn_backbone"
|
||||
RESNETS:
|
||||
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
||||
FPN:
|
||||
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
||||
ANCHOR_GENERATOR:
|
||||
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
|
||||
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
|
||||
RPN:
|
||||
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
|
||||
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
|
||||
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
|
||||
# Detectron1 uses 2000 proposals per-batch,
|
||||
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
|
||||
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
|
||||
POST_NMS_TOPK_TRAIN: 1000
|
||||
POST_NMS_TOPK_TEST: 1000
|
||||
ROI_HEADS:
|
||||
NAME: "StandardROIHeads"
|
||||
IN_FEATURES: ["p2", "p3", "p4", "p5"]
|
||||
ROI_BOX_HEAD:
|
||||
NAME: "FastRCNNConvFCHead"
|
||||
NUM_FC: 2
|
||||
POOLER_RESOLUTION: 7
|
||||
ROI_MASK_HEAD:
|
||||
NAME: "MaskRCNNConvUpsampleHead"
|
||||
NUM_CONV: 4
|
||||
POOLER_RESOLUTION: 14
|
||||
DATASETS:
|
||||
TRAIN: ("coco_2017_train",)
|
||||
TEST: ("coco_2017_val",)
|
||||
SOLVER:
|
||||
IMS_PER_BATCH: 16
|
||||
BASE_LR: 0.02
|
||||
STEPS: (60000, 80000)
|
||||
MAX_ITER: 90000
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
||||
VERSION: 2
|
|
@ -0,0 +1,25 @@
|
|||
MODEL:
|
||||
META_ARCHITECTURE: "RetinaNet"
|
||||
BACKBONE:
|
||||
NAME: "build_retinanet_resnet_fpn_backbone"
|
||||
RESNETS:
|
||||
OUT_FEATURES: ["res3", "res4", "res5"]
|
||||
ANCHOR_GENERATOR:
|
||||
SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"]
|
||||
FPN:
|
||||
IN_FEATURES: ["res3", "res4", "res5"]
|
||||
RETINANET:
|
||||
IOU_THRESHOLDS: [0.4, 0.5]
|
||||
IOU_LABELS: [0, -1, 1]
|
||||
SMOOTH_L1_LOSS_BETA: 0.0
|
||||
DATASETS:
|
||||
TRAIN: ("coco_2017_train",)
|
||||
TEST: ("coco_2017_val",)
|
||||
SOLVER:
|
||||
IMS_PER_BATCH: 16
|
||||
BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate
|
||||
STEPS: (60000, 80000)
|
||||
MAX_ITER: 90000
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
||||
VERSION: 2
|
|
@ -0,0 +1,17 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
LOAD_PROPOSALS: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
PROPOSAL_GENERATOR:
|
||||
NAME: "PrecomputedProposals"
|
||||
DATASETS:
|
||||
TRAIN: ("coco_2017_train",)
|
||||
PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", )
|
||||
TEST: ("coco_2017_val",)
|
||||
PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
|
||||
DATALOADER:
|
||||
# proposals are part of the dataset_dicts, and take a lot of RAM
|
||||
NUM_WORKERS: 2
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-C4.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-DilatedC5.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,6 @@
|
|||
_BASE_: "../Base-RCNN-C4.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-C4.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,6 @@
|
|||
_BASE_: "../Base-RCNN-DilatedC5.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-DilatedC5.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,6 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,13 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
MASK_ON: False
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
|
||||
PIXEL_STD: [57.375, 57.120, 58.395]
|
||||
RESNETS:
|
||||
STRIDE_IN_1X1: False # this is a C2 model
|
||||
NUM_GROUPS: 32
|
||||
WIDTH_PER_GROUP: 8
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,11 @@
|
|||
from ..common.optim import SGD as optimizer
|
||||
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
|
||||
from ..common.data.coco import dataloader
|
||||
from ..common.models.fcos import model
|
||||
from ..common.train import train
|
||||
|
||||
dataloader.train.mapper.use_instance_mask = False
|
||||
optimizer.lr = 0.01
|
||||
|
||||
model.backbone.bottom_up.freeze_at = 2
|
||||
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
|
@ -0,0 +1,8 @@
|
|||
_BASE_: "../Base-RetinaNet.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,11 @@
|
|||
from ..common.optim import SGD as optimizer
|
||||
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
|
||||
from ..common.data.coco import dataloader
|
||||
from ..common.models.retinanet import model
|
||||
from ..common.train import train
|
||||
|
||||
dataloader.train.mapper.use_instance_mask = False
|
||||
model.backbone.bottom_up.freeze_at = 2
|
||||
optimizer.lr = 0.01
|
||||
|
||||
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
|
@ -0,0 +1,5 @@
|
|||
_BASE_: "../Base-RetinaNet.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
RESNETS:
|
||||
DEPTH: 50
|
|
@ -0,0 +1,8 @@
|
|||
_BASE_: "../Base-RetinaNet.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,10 @@
|
|||
_BASE_: "../Base-RCNN-C4.yaml"
|
||||
MODEL:
|
||||
META_ARCHITECTURE: "ProposalNetwork"
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
RPN:
|
||||
PRE_NMS_TOPK_TEST: 12000
|
||||
POST_NMS_TOPK_TEST: 2000
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
META_ARCHITECTURE: "ProposalNetwork"
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
RPN:
|
||||
POST_NMS_TOPK_TEST: 2000
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-C4.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-DilatedC5.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,8 @@
|
|||
from ..common.train import train
|
||||
from ..common.optim import SGD as optimizer
|
||||
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
|
||||
from ..common.data.coco import dataloader
|
||||
from ..common.models.mask_rcnn_c4 import model
|
||||
|
||||
model.backbone.freeze_at = 2
|
||||
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
|
@ -0,0 +1,6 @@
|
|||
_BASE_: "../Base-RCNN-C4.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-C4.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,6 @@
|
|||
_BASE_: "../Base-RCNN-DilatedC5.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-DilatedC5.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,8 @@
|
|||
from ..common.optim import SGD as optimizer
|
||||
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
|
||||
from ..common.data.coco import dataloader
|
||||
from ..common.models.mask_rcnn_fpn import model
|
||||
from ..common.train import train
|
||||
|
||||
model.backbone.bottom_up.freeze_at = 2
|
||||
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
|
@ -0,0 +1,6 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
|
@ -0,0 +1,12 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
RPN:
|
||||
BBOX_REG_LOSS_TYPE: "giou"
|
||||
BBOX_REG_LOSS_WEIGHT: 2.0
|
||||
ROI_BOX_HEAD:
|
||||
BBOX_REG_LOSS_TYPE: "giou"
|
||||
BBOX_REG_LOSS_WEIGHT: 10.0
|
|
@ -0,0 +1,9 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,13 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
MASK_ON: True
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
|
||||
PIXEL_STD: [57.375, 57.120, 58.395]
|
||||
RESNETS:
|
||||
STRIDE_IN_1X1: False # this is a C2 model
|
||||
NUM_GROUPS: 32
|
||||
WIDTH_PER_GROUP: 8
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,34 @@
|
|||
from ..common.optim import SGD as optimizer
|
||||
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
|
||||
from ..common.data.coco import dataloader
|
||||
from ..common.models.mask_rcnn_fpn import model
|
||||
from ..common.train import train
|
||||
|
||||
from detectron2.config import LazyCall as L
|
||||
from detectron2.modeling.backbone import RegNet
|
||||
from detectron2.modeling.backbone.regnet import SimpleStem, ResBottleneckBlock
|
||||
|
||||
|
||||
# Replace default ResNet with RegNetX-4GF from the DDS paper. Config source:
|
||||
# https://github.com/facebookresearch/pycls/blob/2c152a6e5d913e898cca4f0a758f41e6b976714d/configs/dds_baselines/regnetx/RegNetX-4.0GF_dds_8gpu.yaml#L4-L9 # noqa
|
||||
model.backbone.bottom_up = L(RegNet)(
|
||||
stem_class=SimpleStem,
|
||||
stem_width=32,
|
||||
block_class=ResBottleneckBlock,
|
||||
depth=23,
|
||||
w_a=38.65,
|
||||
w_0=96,
|
||||
w_m=2.43,
|
||||
group_width=40,
|
||||
freeze_at=2,
|
||||
norm="FrozenBN",
|
||||
out_features=["s1", "s2", "s3", "s4"],
|
||||
)
|
||||
model.pixel_std = [57.375, 57.120, 58.395]
|
||||
|
||||
optimizer.weight_decay = 5e-5
|
||||
train.init_checkpoint = (
|
||||
"https://dl.fbaipublicfiles.com/pycls/dds_baselines/160906383/RegNetX-4.0GF_dds_8gpu.pyth"
|
||||
)
|
||||
# RegNets benefit from enabling cudnn benchmark mode
|
||||
train.cudnn_benchmark = True
|
|
@ -0,0 +1,35 @@
|
|||
from ..common.optim import SGD as optimizer
|
||||
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
|
||||
from ..common.data.coco import dataloader
|
||||
from ..common.models.mask_rcnn_fpn import model
|
||||
from ..common.train import train
|
||||
|
||||
from detectron2.config import LazyCall as L
|
||||
from detectron2.modeling.backbone import RegNet
|
||||
from detectron2.modeling.backbone.regnet import SimpleStem, ResBottleneckBlock
|
||||
|
||||
|
||||
# Replace default ResNet with RegNetY-4GF from the DDS paper. Config source:
|
||||
# https://github.com/facebookresearch/pycls/blob/2c152a6e5d913e898cca4f0a758f41e6b976714d/configs/dds_baselines/regnety/RegNetY-4.0GF_dds_8gpu.yaml#L4-L10 # noqa
|
||||
model.backbone.bottom_up = L(RegNet)(
|
||||
stem_class=SimpleStem,
|
||||
stem_width=32,
|
||||
block_class=ResBottleneckBlock,
|
||||
depth=22,
|
||||
w_a=31.41,
|
||||
w_0=96,
|
||||
w_m=2.24,
|
||||
group_width=64,
|
||||
se_ratio=0.25,
|
||||
freeze_at=2,
|
||||
norm="FrozenBN",
|
||||
out_features=["s1", "s2", "s3", "s4"],
|
||||
)
|
||||
model.pixel_std = [57.375, 57.120, 58.395]
|
||||
|
||||
optimizer.weight_decay = 5e-5
|
||||
train.init_checkpoint = (
|
||||
"https://dl.fbaipublicfiles.com/pycls/dds_baselines/160906838/RegNetY-4.0GF_dds_8gpu.pyth"
|
||||
)
|
||||
# RegNets benefit from enabling cudnn benchmark mode
|
||||
train.cudnn_benchmark = True
|
|
@ -0,0 +1,15 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
KEYPOINT_ON: True
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 1
|
||||
ROI_BOX_HEAD:
|
||||
SMOOTH_L1_BETA: 0.5 # Keypoint AP degrades (though box AP improves) when using plain L1 loss
|
||||
RPN:
|
||||
# Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2.
|
||||
# 1000 proposals per-image is found to hurt box AP.
|
||||
# Therefore we increase it to 1500 per-image.
|
||||
POST_NMS_TOPK_TRAIN: 1500
|
||||
DATASETS:
|
||||
TRAIN: ("keypoints_coco_2017_train",)
|
||||
TEST: ("keypoints_coco_2017_val",)
|
|
@ -0,0 +1,8 @@
|
|||
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,8 @@
|
|||
from ..common.optim import SGD as optimizer
|
||||
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
|
||||
from ..common.data.coco_keypoint import dataloader
|
||||
from ..common.models.keypoint_rcnn_fpn import model
|
||||
from ..common.train import train
|
||||
|
||||
model.backbone.bottom_up.freeze_at = 2
|
||||
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
|
@ -0,0 +1,5 @@
|
|||
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
RESNETS:
|
||||
DEPTH: 50
|
|
@ -0,0 +1,8 @@
|
|||
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,12 @@
|
|||
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
|
||||
PIXEL_STD: [57.375, 57.120, 58.395]
|
||||
RESNETS:
|
||||
STRIDE_IN_1X1: False # this is a C2 model
|
||||
NUM_GROUPS: 32
|
||||
WIDTH_PER_GROUP: 8
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,11 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
META_ARCHITECTURE: "PanopticFPN"
|
||||
MASK_ON: True
|
||||
SEM_SEG_HEAD:
|
||||
LOSS_WEIGHT: 0.5
|
||||
DATASETS:
|
||||
TRAIN: ("coco_2017_train_panoptic_separated",)
|
||||
TEST: ("coco_2017_val_panoptic_separated",)
|
||||
DATALOADER:
|
||||
FILTER_EMPTY_ANNOTATIONS: False
|
|
@ -0,0 +1,8 @@
|
|||
_BASE_: "Base-Panoptic-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,8 @@
|
|||
from ..common.optim import SGD as optimizer
|
||||
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
|
||||
from ..common.data.coco_panoptic_separated import dataloader
|
||||
from ..common.models.panoptic_fpn import model
|
||||
from ..common.train import train
|
||||
|
||||
model.backbone.bottom_up.freeze_at = 2
|
||||
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
|
@ -0,0 +1,5 @@
|
|||
_BASE_: "Base-Panoptic-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
RESNETS:
|
||||
DEPTH: 50
|
|
@ -0,0 +1,8 @@
|
|||
_BASE_: "Base-Panoptic-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,27 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
# WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
# For better, more stable performance initialize from COCO
|
||||
WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"
|
||||
MASK_ON: True
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 8
|
||||
# This is similar to the setting used in Mask R-CNN paper, Appendix A
|
||||
# But there are some differences, e.g., we did not initialize the output
|
||||
# layer using the corresponding classes from COCO
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024)
|
||||
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
||||
MIN_SIZE_TEST: 1024
|
||||
MAX_SIZE_TRAIN: 2048
|
||||
MAX_SIZE_TEST: 2048
|
||||
DATASETS:
|
||||
TRAIN: ("cityscapes_fine_instance_seg_train",)
|
||||
TEST: ("cityscapes_fine_instance_seg_val",)
|
||||
SOLVER:
|
||||
BASE_LR: 0.01
|
||||
STEPS: (18000,)
|
||||
MAX_ITER: 24000
|
||||
IMS_PER_BATCH: 8
|
||||
TEST:
|
||||
EVAL_PERIOD: 8000
|
|
@@ -0,0 +1,84 @@

Detectron2 model zoo's experimental settings and a few implementation details are different from Detectron.

The differences in implementation details are shared in
[Compatibility with Other Libraries](../../docs/notes/compatibility.md).

The differences in model zoo's experimental settings include:
* Use scale augmentation during training. This improves AP with lower training cost.
* Use L1 loss instead of smooth L1 loss for simplicity. This sometimes improves box AP but may
  affect other AP.
* Use `POOLER_SAMPLING_RATIO=0` instead of 2. This does not significantly affect AP.
* Use `ROIAlignV2`. This does not significantly affect AP.

In this directory, we provide a few configs that __do not__ have the above changes.
They mimic Detectron's behavior as closely as possible,
and provide a fair comparison of accuracy and speed against Detectron.

<!--
./gen_html_table.py --config 'Detectron1-Comparisons/*.yaml' --name "Faster R-CNN" "Keypoint R-CNN" "Mask R-CNN" --fields lr_sched train_speed inference_speed mem box_AP mask_AP keypoint_AP --base-dir ../../../configs/Detectron1-Comparisons
-->

<table><tbody>
|
||||
<!-- START TABLE -->
|
||||
<!-- TABLE HEADER -->
|
||||
<th valign="bottom">Name</th>
|
||||
<th valign="bottom">lr<br/>sched</th>
|
||||
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
|
||||
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
|
||||
<th valign="bottom">train<br/>mem<br/>(GB)</th>
|
||||
<th valign="bottom">box<br/>AP</th>
|
||||
<th valign="bottom">mask<br/>AP</th>
|
||||
<th valign="bottom">kp.<br/>AP</th>
|
||||
<th valign="bottom">model id</th>
|
||||
<th valign="bottom">download</th>
|
||||
<!-- TABLE BODY -->
|
||||
<!-- ROW: faster_rcnn_R_50_FPN_noaug_1x -->
|
||||
<tr><td align="left"><a href="faster_rcnn_R_50_FPN_noaug_1x.yaml">Faster R-CNN</a></td>
|
||||
<td align="center">1x</td>
|
||||
<td align="center">0.219</td>
|
||||
<td align="center">0.038</td>
|
||||
<td align="center">3.1</td>
|
||||
<td align="center">36.9</td>
|
||||
<td align="center"></td>
|
||||
<td align="center"></td>
|
||||
<td align="center">137781054</td>
|
||||
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x/137781054/model_final_7ab50c.pkl">model</a> | <a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x/137781054/metrics.json">metrics</a></td>
|
||||
</tr>
|
||||
<!-- ROW: keypoint_rcnn_R_50_FPN_1x -->
|
||||
<tr><td align="left"><a href="keypoint_rcnn_R_50_FPN_1x.yaml">Keypoint R-CNN</a></td>
|
||||
<td align="center">1x</td>
|
||||
<td align="center">0.313</td>
|
||||
<td align="center">0.071</td>
|
||||
<td align="center">5.0</td>
|
||||
<td align="center">53.1</td>
|
||||
<td align="center"></td>
|
||||
<td align="center">64.2</td>
|
||||
<td align="center">137781195</td>
|
||||
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x/137781195/model_final_cce136.pkl">model</a> | <a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x/137781195/metrics.json">metrics</a></td>
|
||||
</tr>
|
||||
<!-- ROW: mask_rcnn_R_50_FPN_noaug_1x -->
|
||||
<tr><td align="left"><a href="mask_rcnn_R_50_FPN_noaug_1x.yaml">Mask R-CNN</a></td>
|
||||
<td align="center">1x</td>
|
||||
<td align="center">0.273</td>
|
||||
<td align="center">0.043</td>
|
||||
<td align="center">3.4</td>
|
||||
<td align="center">37.8</td>
|
||||
<td align="center">34.9</td>
|
||||
<td align="center"></td>
|
||||
<td align="center">137781281</td>
|
||||
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x/137781281/model_final_62ca52.pkl">model</a> | <a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x/137781281/metrics.json">metrics</a></td>
|
||||
</tr>
|
||||
</tbody></table>

## Comparisons:

* Faster R-CNN: Detectron's AP is 36.7, similar to ours.
* Keypoint R-CNN: Detectron's AP is box 53.6, keypoint 64.2. Fixing a
  [bug](https://github.com/facebookresearch/Detectron/issues/459) in Detectron leads to a drop in box AP, which can be
  compensated for by some parameter tuning.
* Mask R-CNN: Detectron's AP is box 37.7, mask 33.9. We're 1 AP better in mask AP, due to a more correct implementation.
  See [this article](https://ppwwyyxx.com/blog/2021/Where-are-Pixels/) for details.

For speed comparisons, see [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html).

@ -0,0 +1,17 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
# Detectron1 uses smooth L1 loss with some magic beta values.
|
||||
# The defaults are changed to L1 loss in Detectron2.
|
||||
RPN:
|
||||
SMOOTH_L1_BETA: 0.1111
|
||||
ROI_BOX_HEAD:
|
||||
SMOOTH_L1_BETA: 1.0
|
||||
POOLER_SAMPLING_RATIO: 2
|
||||
POOLER_TYPE: "ROIAlign"
|
||||
INPUT:
|
||||
# no scale augmentation
|
||||
MIN_SIZE_TRAIN: (800, )
|
|
@ -0,0 +1,27 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
KEYPOINT_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 1
|
||||
ROI_KEYPOINT_HEAD:
|
||||
POOLER_RESOLUTION: 14
|
||||
POOLER_SAMPLING_RATIO: 2
|
||||
POOLER_TYPE: "ROIAlign"
|
||||
# Detectron1 uses smooth L1 loss with some magic beta values.
|
||||
# The defaults are changed to L1 loss in Detectron2.
|
||||
ROI_BOX_HEAD:
|
||||
SMOOTH_L1_BETA: 1.0
|
||||
POOLER_SAMPLING_RATIO: 2
|
||||
POOLER_TYPE: "ROIAlign"
|
||||
RPN:
|
||||
SMOOTH_L1_BETA: 0.1111
|
||||
# Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2
|
||||
# 1000 proposals per-image is found to hurt box AP.
|
||||
# Therefore we increase it to 1500 per-image.
|
||||
POST_NMS_TOPK_TRAIN: 1500
|
||||
DATASETS:
|
||||
TRAIN: ("keypoints_coco_2017_train",)
|
||||
TEST: ("keypoints_coco_2017_val",)
|
|
@ -0,0 +1,20 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
# Detectron1 uses smooth L1 loss with some magic beta values.
|
||||
# The defaults are changed to L1 loss in Detectron2.
|
||||
RPN:
|
||||
SMOOTH_L1_BETA: 0.1111
|
||||
ROI_BOX_HEAD:
|
||||
SMOOTH_L1_BETA: 1.0
|
||||
POOLER_SAMPLING_RATIO: 2
|
||||
POOLER_TYPE: "ROIAlign"
|
||||
ROI_MASK_HEAD:
|
||||
POOLER_SAMPLING_RATIO: 2
|
||||
POOLER_TYPE: "ROIAlign"
|
||||
INPUT:
|
||||
# no scale augmentation
|
||||
MIN_SIZE_TRAIN: (800, )
|
|
@ -0,0 +1,19 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 1230
|
||||
SCORE_THRESH_TEST: 0.0001
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
||||
DATASETS:
|
||||
TRAIN: ("lvis_v0.5_train",)
|
||||
TEST: ("lvis_v0.5_val",)
|
||||
TEST:
|
||||
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
|
||||
DATALOADER:
|
||||
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
|
||||
REPEAT_THRESHOLD: 0.001
|
|
@ -0,0 +1,19 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 1230
|
||||
SCORE_THRESH_TEST: 0.0001
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
||||
DATASETS:
|
||||
TRAIN: ("lvis_v0.5_train",)
|
||||
TEST: ("lvis_v0.5_val",)
|
||||
TEST:
|
||||
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
|
||||
DATALOADER:
|
||||
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
|
||||
REPEAT_THRESHOLD: 0.001
|
|
@ -0,0 +1,23 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
|
||||
PIXEL_STD: [57.375, 57.120, 58.395]
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
STRIDE_IN_1X1: False # this is a C2 model
|
||||
NUM_GROUPS: 32
|
||||
WIDTH_PER_GROUP: 8
|
||||
DEPTH: 101
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 1230
|
||||
SCORE_THRESH_TEST: 0.0001
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
||||
DATASETS:
|
||||
TRAIN: ("lvis_v0.5_train",)
|
||||
TEST: ("lvis_v0.5_val",)
|
||||
TEST:
|
||||
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
|
||||
DATALOADER:
|
||||
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
|
||||
REPEAT_THRESHOLD: 0.001
|
|
@ -0,0 +1,22 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 1203
|
||||
SCORE_THRESH_TEST: 0.0001
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
||||
DATASETS:
|
||||
TRAIN: ("lvis_v1_train",)
|
||||
TEST: ("lvis_v1_val",)
|
||||
TEST:
|
||||
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
|
||||
SOLVER:
|
||||
STEPS: (120000, 160000)
|
||||
MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
|
||||
DATALOADER:
|
||||
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
|
||||
REPEAT_THRESHOLD: 0.001
|
|
@ -0,0 +1,22 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 1203
|
||||
SCORE_THRESH_TEST: 0.0001
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
||||
DATASETS:
|
||||
TRAIN: ("lvis_v1_train",)
|
||||
TEST: ("lvis_v1_val",)
|
||||
TEST:
|
||||
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
|
||||
SOLVER:
|
||||
STEPS: (120000, 160000)
|
||||
MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
|
||||
DATALOADER:
|
||||
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
|
||||
REPEAT_THRESHOLD: 0.001
|
|
@ -0,0 +1,26 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
|
||||
PIXEL_STD: [57.375, 57.120, 58.395]
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
STRIDE_IN_1X1: False # this is a C2 model
|
||||
NUM_GROUPS: 32
|
||||
WIDTH_PER_GROUP: 8
|
||||
DEPTH: 101
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 1203
|
||||
SCORE_THRESH_TEST: 0.0001
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
||||
DATASETS:
|
||||
TRAIN: ("lvis_v1_train",)
|
||||
TEST: ("lvis_v1_val",)
|
||||
SOLVER:
|
||||
STEPS: (120000, 160000)
|
||||
MAX_ITER: 180000 # 180000 * 16 / 100000 ~ 28.8 epochs
|
||||
TEST:
|
||||
DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
|
||||
DATALOADER:
|
||||
SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
|
||||
REPEAT_THRESHOLD: 0.001
|
|
@ -0,0 +1,12 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
ROI_HEADS:
|
||||
NAME: CascadeROIHeads
|
||||
ROI_BOX_HEAD:
|
||||
CLS_AGNOSTIC_BBOX_REG: True
|
||||
RPN:
|
||||
POST_NMS_TOPK_TRAIN: 2000
|
|
@ -0,0 +1,15 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
ROI_HEADS:
|
||||
NAME: CascadeROIHeads
|
||||
ROI_BOX_HEAD:
|
||||
CLS_AGNOSTIC_BBOX_REG: True
|
||||
RPN:
|
||||
POST_NMS_TOPK_TRAIN: 2000
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,36 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
MASK_ON: True
|
||||
WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k"
|
||||
RESNETS:
|
||||
STRIDE_IN_1X1: False # this is a C2 model
|
||||
NUM_GROUPS: 32
|
||||
WIDTH_PER_GROUP: 8
|
||||
DEPTH: 152
|
||||
DEFORM_ON_PER_STAGE: [False, True, True, True]
|
||||
ROI_HEADS:
|
||||
NAME: "CascadeROIHeads"
|
||||
ROI_BOX_HEAD:
|
||||
NAME: "FastRCNNConvFCHead"
|
||||
NUM_CONV: 4
|
||||
NUM_FC: 1
|
||||
NORM: "GN"
|
||||
CLS_AGNOSTIC_BBOX_REG: True
|
||||
ROI_MASK_HEAD:
|
||||
NUM_CONV: 8
|
||||
NORM: "GN"
|
||||
RPN:
|
||||
POST_NMS_TOPK_TRAIN: 2000
|
||||
SOLVER:
|
||||
IMS_PER_BATCH: 128
|
||||
STEPS: (35000, 45000)
|
||||
MAX_ITER: 50000
|
||||
BASE_LR: 0.16
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 864)
|
||||
MIN_SIZE_TRAIN_SAMPLING: "range"
|
||||
MAX_SIZE_TRAIN: 1440
|
||||
CROP:
|
||||
ENABLED: True
|
||||
TEST:
|
||||
EVAL_PERIOD: 2500
|
|
@ -0,0 +1,10 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
ROI_BOX_HEAD:
|
||||
CLS_AGNOSTIC_BBOX_REG: True
|
||||
ROI_MASK_HEAD:
|
||||
CLS_AGNOSTIC_MASK: True
|
|
@ -0,0 +1,8 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
|
||||
DEFORM_MODULATED: False
|
|
@ -0,0 +1,11 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
|
||||
DEFORM_MODULATED: False
|
||||
SOLVER:
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,21 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-50-GN"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
NORM: "GN"
|
||||
STRIDE_IN_1X1: False
|
||||
FPN:
|
||||
NORM: "GN"
|
||||
ROI_BOX_HEAD:
|
||||
NAME: "FastRCNNConvFCHead"
|
||||
NUM_CONV: 4
|
||||
NUM_FC: 1
|
||||
NORM: "GN"
|
||||
ROI_MASK_HEAD:
|
||||
NORM: "GN"
|
||||
SOLVER:
|
||||
# 3x schedule
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
|
@ -0,0 +1,24 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
NORM: "SyncBN"
|
||||
STRIDE_IN_1X1: True
|
||||
FPN:
|
||||
NORM: "SyncBN"
|
||||
ROI_BOX_HEAD:
|
||||
NAME: "FastRCNNConvFCHead"
|
||||
NUM_CONV: 4
|
||||
NUM_FC: 1
|
||||
NORM: "SyncBN"
|
||||
ROI_MASK_HEAD:
|
||||
NORM: "SyncBN"
|
||||
SOLVER:
|
||||
# 3x schedule
|
||||
STEPS: (210000, 250000)
|
||||
MAX_ITER: 270000
|
||||
TEST:
|
||||
PRECISE_BN:
|
||||
ENABLED: True
|
|
@ -0,0 +1,151 @@
|
|||
# An example config to train a mmdetection model using detectron2.
|
||||
|
||||
from ..common.data.coco import dataloader
|
||||
from ..common.coco_schedule import lr_multiplier_1x as lr_multiplier
|
||||
from ..common.optim import SGD as optimizer
|
||||
from ..common.train import train
|
||||
|
||||
from detectron2.modeling.mmdet_wrapper import MMDetDetector
|
||||
from detectron2.config import LazyCall as L
|
||||
|
||||
model = L(MMDetDetector)(
|
||||
detector=dict(
|
||||
type="MaskRCNN",
|
||||
pretrained="torchvision://resnet50",
|
||||
backbone=dict(
|
||||
type="ResNet",
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
frozen_stages=1,
|
||||
norm_cfg=dict(type="BN", requires_grad=True),
|
||||
norm_eval=True,
|
||||
style="pytorch",
|
||||
),
|
||||
neck=dict(type="FPN", in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5),
|
||||
rpn_head=dict(
|
||||
type="RPNHead",
|
||||
in_channels=256,
|
||||
feat_channels=256,
|
||||
anchor_generator=dict(
|
||||
type="AnchorGenerator",
|
||||
scales=[8],
|
||||
ratios=[0.5, 1.0, 2.0],
|
||||
strides=[4, 8, 16, 32, 64],
|
||||
),
|
||||
bbox_coder=dict(
|
||||
type="DeltaXYWHBBoxCoder",
|
||||
target_means=[0.0, 0.0, 0.0, 0.0],
|
||||
target_stds=[1.0, 1.0, 1.0, 1.0],
|
||||
),
|
||||
loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=1.0),
|
||||
loss_bbox=dict(type="L1Loss", loss_weight=1.0),
|
||||
),
|
||||
roi_head=dict(
|
||||
type="StandardRoIHead",
|
||||
bbox_roi_extractor=dict(
|
||||
type="SingleRoIExtractor",
|
||||
roi_layer=dict(type="RoIAlign", output_size=7, sampling_ratio=0),
|
||||
out_channels=256,
|
||||
featmap_strides=[4, 8, 16, 32],
|
||||
),
|
||||
bbox_head=dict(
|
||||
type="Shared2FCBBoxHead",
|
||||
in_channels=256,
|
||||
fc_out_channels=1024,
|
||||
roi_feat_size=7,
|
||||
num_classes=80,
|
||||
bbox_coder=dict(
|
||||
type="DeltaXYWHBBoxCoder",
|
||||
target_means=[0.0, 0.0, 0.0, 0.0],
|
||||
target_stds=[0.1, 0.1, 0.2, 0.2],
|
||||
),
|
||||
reg_class_agnostic=False,
|
||||
loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=False, loss_weight=1.0),
|
||||
loss_bbox=dict(type="L1Loss", loss_weight=1.0),
|
||||
),
|
||||
mask_roi_extractor=dict(
|
||||
type="SingleRoIExtractor",
|
||||
roi_layer=dict(type="RoIAlign", output_size=14, sampling_ratio=0),
|
||||
out_channels=256,
|
||||
featmap_strides=[4, 8, 16, 32],
|
||||
),
|
||||
mask_head=dict(
|
||||
type="FCNMaskHead",
|
||||
num_convs=4,
|
||||
in_channels=256,
|
||||
conv_out_channels=256,
|
||||
num_classes=80,
|
||||
loss_mask=dict(type="CrossEntropyLoss", use_mask=True, loss_weight=1.0),
|
||||
),
|
||||
),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(
|
||||
rpn=dict(
|
||||
assigner=dict(
|
||||
type="MaxIoUAssigner",
|
||||
pos_iou_thr=0.7,
|
||||
neg_iou_thr=0.3,
|
||||
min_pos_iou=0.3,
|
||||
match_low_quality=True,
|
||||
ignore_iof_thr=-1,
|
||||
),
|
||||
sampler=dict(
|
||||
type="RandomSampler",
|
||||
num=256,
|
||||
pos_fraction=0.5,
|
||||
neg_pos_ub=-1,
|
||||
add_gt_as_proposals=False,
|
||||
),
|
||||
allowed_border=-1,
|
||||
pos_weight=-1,
|
||||
debug=False,
|
||||
),
|
||||
rpn_proposal=dict(
|
||||
nms_pre=2000,
|
||||
max_per_img=1000,
|
||||
nms=dict(type="nms", iou_threshold=0.7),
|
||||
min_bbox_size=0,
|
||||
),
|
||||
rcnn=dict(
|
||||
assigner=dict(
|
||||
type="MaxIoUAssigner",
|
||||
pos_iou_thr=0.5,
|
||||
neg_iou_thr=0.5,
|
||||
min_pos_iou=0.5,
|
||||
match_low_quality=True,
|
||||
ignore_iof_thr=-1,
|
||||
),
|
||||
sampler=dict(
|
||||
type="RandomSampler",
|
||||
num=512,
|
||||
pos_fraction=0.25,
|
||||
neg_pos_ub=-1,
|
||||
add_gt_as_proposals=True,
|
||||
),
|
||||
mask_size=28,
|
||||
pos_weight=-1,
|
||||
debug=False,
|
||||
),
|
||||
),
|
||||
test_cfg=dict(
|
||||
rpn=dict(
|
||||
nms_pre=1000,
|
||||
max_per_img=1000,
|
||||
nms=dict(type="nms", iou_threshold=0.7),
|
||||
min_bbox_size=0,
|
||||
),
|
||||
rcnn=dict(
|
||||
score_thr=0.05,
|
||||
nms=dict(type="nms", iou_threshold=0.5),
|
||||
max_per_img=100,
|
||||
mask_thr_binary=0.5,
|
||||
),
|
||||
),
|
||||
),
|
||||
pixel_mean=[123.675, 116.280, 103.530],
|
||||
pixel_std=[58.395, 57.120, 57.375],
|
||||
)
|
||||
|
||||
dataloader.train.mapper.image_format = "RGB" # torchvision pretrained model
|
||||
train.init_checkpoint = None # pretrained model is loaded inside backbone
|
|
@ -0,0 +1,26 @@
|
|||
# A large PanopticFPN for demo purposes.
|
||||
# Use GN on backbone to support semantic seg.
|
||||
# Use Cascade + Deform Conv to improve localization.
|
||||
_BASE_: "../COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-101-GN"
|
||||
RESNETS:
|
||||
DEPTH: 101
|
||||
NORM: "GN"
|
||||
DEFORM_ON_PER_STAGE: [False, True, True, True]
|
||||
STRIDE_IN_1X1: False
|
||||
FPN:
|
||||
NORM: "GN"
|
||||
ROI_HEADS:
|
||||
NAME: CascadeROIHeads
|
||||
ROI_BOX_HEAD:
|
||||
CLS_AGNOSTIC_BBOX_REG: True
|
||||
ROI_MASK_HEAD:
|
||||
NORM: "GN"
|
||||
RPN:
|
||||
POST_NMS_TOPK_TRAIN: 2000
|
||||
SOLVER:
|
||||
STEPS: (105000, 125000)
|
||||
MAX_ITER: 135000
|
||||
IMS_PER_BATCH: 32
|
||||
BASE_LR: 0.04
|
|
@ -0,0 +1,13 @@
|
|||
_BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml"
|
||||
MODEL:
|
||||
# Train from random initialization.
|
||||
WEIGHTS: ""
|
||||
# It makes sense to divide by STD when training from scratch
|
||||
# But it seems to make no difference on the results and C2's models didn't do this.
|
||||
# So we keep things consistent with C2.
|
||||
# PIXEL_STD: [57.375, 57.12, 58.395]
|
||||
MASK_ON: True
|
||||
BACKBONE:
|
||||
FREEZE_AT: 0
|
||||
# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
|
||||
# to learn what you need for training from scratch.
|
|
@ -0,0 +1,19 @@
|
|||
_BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml"
|
||||
MODEL:
|
||||
PIXEL_STD: [57.375, 57.12, 58.395]
|
||||
WEIGHTS: ""
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
STRIDE_IN_1X1: False
|
||||
BACKBONE:
|
||||
FREEZE_AT: 0
|
||||
SOLVER:
|
||||
# 9x schedule
|
||||
IMS_PER_BATCH: 64 # 4x the standard
|
||||
STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k
|
||||
MAX_ITER: 202500 # 90k * 9 / 4
|
||||
BASE_LR: 0.08
|
||||
TEST:
|
||||
EVAL_PERIOD: 2500
|
||||
# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
|
||||
# to learn what you need for training from scratch.
|
|
@ -0,0 +1,19 @@
|
|||
_BASE_: "mask_rcnn_R_50_FPN_3x_syncbn.yaml"
|
||||
MODEL:
|
||||
PIXEL_STD: [57.375, 57.12, 58.395]
|
||||
WEIGHTS: ""
|
||||
MASK_ON: True
|
||||
RESNETS:
|
||||
STRIDE_IN_1X1: False
|
||||
BACKBONE:
|
||||
FREEZE_AT: 0
|
||||
SOLVER:
|
||||
# 9x schedule
|
||||
IMS_PER_BATCH: 64 # 4x the standard
|
||||
STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k
|
||||
MAX_ITER: 202500 # 90k * 9 / 4
|
||||
BASE_LR: 0.08
|
||||
TEST:
|
||||
EVAL_PERIOD: 2500
|
||||
# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883
|
||||
# to learn what you need for training from scratch.
|
|
@ -0,0 +1,11 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
META_ARCHITECTURE: "SemanticSegmentor"
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
DATASETS:
|
||||
TRAIN: ("coco_2017_train_panoptic_stuffonly",)
|
||||
TEST: ("coco_2017_val_panoptic_stuffonly",)
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
|
|
@ -0,0 +1,150 @@
|
|||
"""
|
||||
An example config file to train a ImageNet classifier with detectron2.
|
||||
Model and dataloader both come from torchvision.
|
||||
This shows how to use detectron2 as a general engine for any new models and tasks.
|
||||
|
||||
To run, use the following command:
|
||||
|
||||
python tools/lazyconfig_train_net.py --config-file configs/Misc/torchvision_imagenet_R_50.py \
|
||||
--num-gpus 8 dataloader.train.dataset.root=/path/to/imagenet/
|
||||
|
||||
"""
|
||||
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
from omegaconf import OmegaConf
|
||||
import torchvision
|
||||
from torchvision.transforms import transforms as T
|
||||
from torchvision.models.resnet import ResNet, Bottleneck
|
||||
from fvcore.common.param_scheduler import MultiStepParamScheduler
|
||||
|
||||
from detectron2.solver import WarmupParamScheduler
|
||||
from detectron2.solver.build import get_default_optimizer_params
|
||||
from detectron2.config import LazyCall as L
|
||||
from detectron2.model_zoo import get_config
|
||||
from detectron2.data.samplers import TrainingSampler, InferenceSampler
|
||||
from detectron2.evaluation import DatasetEvaluator
|
||||
from detectron2.utils import comm
|
||||
|
||||
|
||||
"""
|
||||
Note: Here we put reusable code (models, evaluation, data) together with configs just as a
|
||||
proof-of-concept, to easily demonstrate what's needed to train a ImageNet classifier in detectron2.
|
||||
Writing code in configs offers extreme flexibility but is often not a good engineering practice.
|
||||
In practice, you might want to put code in your project and import them instead.
|
||||
"""
|
||||
|
||||
|
||||
def build_data_loader(dataset, batch_size, num_workers, training=True):
|
||||
return torch.utils.data.DataLoader(
|
||||
dataset,
|
||||
sampler=(TrainingSampler if training else InferenceSampler)(len(dataset)),
|
||||
batch_size=batch_size,
|
||||
num_workers=num_workers,
|
||||
pin_memory=True,
|
||||
)
|
||||
|
||||
|
||||
class ClassificationNet(nn.Module):
|
||||
def __init__(self, model: nn.Module):
|
||||
super().__init__()
|
||||
self.model = model
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return list(self.model.parameters())[0].device
|
||||
|
||||
def forward(self, inputs):
|
||||
image, label = inputs
|
||||
pred = self.model(image.to(self.device))
|
||||
if self.training:
|
||||
label = label.to(self.device)
|
||||
return F.cross_entropy(pred, label)
|
||||
else:
|
||||
return pred
|
||||
|
||||
|
||||
class ClassificationAcc(DatasetEvaluator):
|
||||
def reset(self):
|
||||
self.corr = self.total = 0
|
||||
|
||||
def process(self, inputs, outputs):
|
||||
image, label = inputs
|
||||
self.corr += (outputs.argmax(dim=1).cpu() == label.cpu()).sum().item()
|
||||
self.total += len(label)
|
||||
|
||||
def evaluate(self):
|
||||
all_corr_total = comm.all_gather([self.corr, self.total])
|
||||
corr = sum(x[0] for x in all_corr_total)
|
||||
total = sum(x[1] for x in all_corr_total)
|
||||
return {"accuracy": corr / total}
|
||||
|
||||
|
||||
# --- End of code that could be in a project and be imported
|
||||
|
||||
|
||||
dataloader = OmegaConf.create()
|
||||
dataloader.train = L(build_data_loader)(
|
||||
dataset=L(torchvision.datasets.ImageNet)(
|
||||
root="/path/to/imagenet",
|
||||
split="train",
|
||||
transform=L(T.Compose)(
|
||||
transforms=[
|
||||
L(T.RandomResizedCrop)(size=224),
|
||||
L(T.RandomHorizontalFlip)(),
|
||||
T.ToTensor(),
|
||||
L(T.Normalize)(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
|
||||
]
|
||||
),
|
||||
),
|
||||
batch_size=256 // 8,
|
||||
num_workers=4,
|
||||
training=True,
|
||||
)
|
||||
|
||||
dataloader.test = L(build_data_loader)(
|
||||
dataset=L(torchvision.datasets.ImageNet)(
|
||||
root="${...train.dataset.root}",
|
||||
split="val",
|
||||
transform=L(T.Compose)(
|
||||
transforms=[
|
||||
L(T.Resize)(size=256),
|
||||
L(T.CenterCrop)(size=224),
|
||||
T.ToTensor(),
|
||||
L(T.Normalize)(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
|
||||
]
|
||||
),
|
||||
),
|
||||
batch_size=256 // 8,
|
||||
num_workers=4,
|
||||
training=False,
|
||||
)
|
||||
|
||||
dataloader.evaluator = L(ClassificationAcc)()
|
||||
|
||||
model = L(ClassificationNet)(
|
||||
model=(ResNet)(block=Bottleneck, layers=[3, 4, 6, 3], zero_init_residual=True)
|
||||
)
|
||||
|
||||
|
||||
optimizer = L(torch.optim.SGD)(
|
||||
params=L(get_default_optimizer_params)(),
|
||||
lr=0.1,
|
||||
momentum=0.9,
|
||||
weight_decay=1e-4,
|
||||
)
|
||||
|
||||
lr_multiplier = L(WarmupParamScheduler)(
|
||||
scheduler=L(MultiStepParamScheduler)(
|
||||
values=[1.0, 0.1, 0.01, 0.001], milestones=[30, 60, 90, 100]
|
||||
),
|
||||
warmup_length=1 / 100,
|
||||
warmup_factor=0.1,
|
||||
)
|
||||
|
||||
|
||||
train = get_config("common/train.py").train
|
||||
train.init_checkpoint = None
|
||||
train.max_iter = 100 * 1281167 // 256
|
|
@ -0,0 +1,18 @@
|
|||
_BASE_: "../Base-RCNN-C4.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 20
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
|
||||
MIN_SIZE_TEST: 800
|
||||
DATASETS:
|
||||
TRAIN: ('voc_2007_trainval', 'voc_2012_trainval')
|
||||
TEST: ('voc_2007_test',)
|
||||
SOLVER:
|
||||
STEPS: (12000, 16000)
|
||||
MAX_ITER: 18000 # 17.4 epochs
|
||||
WARMUP_ITERS: 100
|
|
@ -0,0 +1,18 @@
|
|||
_BASE_: "../Base-RCNN-FPN.yaml"
|
||||
MODEL:
|
||||
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
|
||||
MASK_ON: False
|
||||
RESNETS:
|
||||
DEPTH: 50
|
||||
ROI_HEADS:
|
||||
NUM_CLASSES: 20
|
||||
INPUT:
|
||||
MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
|
||||
MIN_SIZE_TEST: 800
|
||||
DATASETS:
|
||||
TRAIN: ('voc_2007_trainval', 'voc_2012_trainval')
|
||||
TEST: ('voc_2007_test',)
|
||||
SOLVER:
|
||||
STEPS: (12000, 16000)
|
||||
MAX_ITER: 18000 # 17.4 epochs
|
||||
WARMUP_ITERS: 100
|
|
@@ -0,0 +1,6 @@

This directory provides definitions for a few common models, dataloaders, schedulers,
and optimizers that are often used in training.
The definitions of these objects are provided in the form of lazy instantiation:
their arguments can be edited by users before constructing the objects.

They can be imported, or loaded by the `model_zoo.get_config` API in users' own configs.

@ -0,0 +1,47 @@
from fvcore.common.param_scheduler import MultiStepParamScheduler

from detectron2.config import LazyCall as L
from detectron2.solver import WarmupParamScheduler


def default_X_scheduler(num_X):
    """
    Returns the config for a default multi-step LR scheduler such as "1x", "3x",
    commonly referred to in papers, where every 1x has the total length of 1440k
    training images (~12 COCO epochs). LR is decayed twice at the end of training
    following the strategy defined in "Rethinking ImageNet Pretraining", Sec 4.

    Args:
        num_X: a positive real number

    Returns:
        DictConfig: configs that define the multiplier for LR during training
    """
    # total number of iterations assuming 16 batch size, using 1440000/16=90000
    total_steps_16bs = num_X * 90000

    if num_X <= 2:
        scheduler = L(MultiStepParamScheduler)(
            values=[1.0, 0.1, 0.01],
            # note that scheduler is scale-invariant. This is equivalent to
            # milestones=[6, 8, 9]
            milestones=[60000, 80000, 90000],
        )
    else:
        scheduler = L(MultiStepParamScheduler)(
            values=[1.0, 0.1, 0.01],
            milestones=[total_steps_16bs - 60000, total_steps_16bs - 20000, total_steps_16bs],
        )
    return L(WarmupParamScheduler)(
        scheduler=scheduler,
        warmup_length=1000 / total_steps_16bs,
        warmup_method="linear",
        warmup_factor=0.001,
    )


lr_multiplier_1x = default_X_scheduler(1)
lr_multiplier_2x = default_X_scheduler(2)
lr_multiplier_3x = default_X_scheduler(3)
lr_multiplier_6x = default_X_scheduler(6)
lr_multiplier_9x = default_X_scheduler(9)
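To make the "Nx" convention concrete, here is a worked example derived from the function above (illustration only, not part of the original file): `default_X_scheduler(3)` uses `total_steps_16bs = 3 * 90000 = 270000`, so the multiplier drops from 1.0 to 0.1 at iteration 210000 and to 0.01 at 250000, with a linear warmup over the first 1000 iterations.

```python
# Illustration only: resolve the "3x" schedule to a concrete scheduler object.
from detectron2.config import instantiate

lr_multiplier_3x_cfg = default_X_scheduler(3)   # total_steps_16bs = 3 * 90000 = 270000
lr_multiplier_3x = instantiate(lr_multiplier_3x_cfg)
# The scheduler is queried with a training fraction in [0, 1); ~90% through training
# corresponds to iteration ~243000, i.e. after the first decay, so the multiplier is 0.1.
print(lr_multiplier_3x(0.9))
```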
@ -0,0 +1,48 @@
from omegaconf import OmegaConf

import detectron2.data.transforms as T
from detectron2.config import LazyCall as L
from detectron2.data import (
    DatasetMapper,
    build_detection_test_loader,
    build_detection_train_loader,
    get_detection_dataset_dicts,
)
from detectron2.evaluation import COCOEvaluator

dataloader = OmegaConf.create()

dataloader.train = L(build_detection_train_loader)(
    dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"),
    mapper=L(DatasetMapper)(
        is_train=True,
        augmentations=[
            L(T.ResizeShortestEdge)(
                short_edge_length=(640, 672, 704, 736, 768, 800),
                sample_style="choice",
                max_size=1333,
            ),
            L(T.RandomFlip)(horizontal=True),
        ],
        image_format="BGR",
        use_instance_mask=True,
    ),
    total_batch_size=16,
    num_workers=4,
)

dataloader.test = L(build_detection_test_loader)(
    dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False),
    mapper=L(DatasetMapper)(
        is_train=False,
        augmentations=[
            L(T.ResizeShortestEdge)(short_edge_length=800, max_size=1333),
        ],
        image_format="${...train.mapper.image_format}",
    ),
    num_workers=4,
)

dataloader.evaluator = L(COCOEvaluator)(
    dataset_name="${..test.dataset.names}",
)
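The strings `"${...train.mapper.image_format}"` and `"${..test.dataset.names}"` above are OmegaConf relative interpolations: each additional leading dot climbs one level up from the node that contains the reference. A minimal, self-contained sketch of how such a reference resolves (illustration only, assuming omegaconf >= 2.1):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "train": {"mapper": {"image_format": "BGR"}},
        "test": {"mapper": {"image_format": "${...train.mapper.image_format}"}},
    }
)
# "..." climbs from test.mapper past test to the root, then follows train.mapper.image_format.
assert cfg.test.mapper.image_format == "BGR"
```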
@ -0,0 +1,13 @@
from detectron2.data.detection_utils import create_keypoint_hflip_indices

from .coco import dataloader

dataloader.train.dataset.min_keypoints = 1
dataloader.train.dataset.names = "keypoints_coco_2017_train"
dataloader.test.dataset.names = "keypoints_coco_2017_val"

dataloader.train.mapper.update(
    use_instance_mask=False,
    use_keypoint=True,
    keypoint_hflip_indices=create_keypoint_hflip_indices(dataloader.train.dataset.names),
)
@ -0,0 +1,26 @@
from detectron2.config import LazyCall as L
from detectron2.evaluation import (
    COCOEvaluator,
    COCOPanopticEvaluator,
    DatasetEvaluators,
    SemSegEvaluator,
)

from .coco import dataloader

dataloader.train.dataset.names = "coco_2017_train_panoptic_separated"
dataloader.train.dataset.filter_empty = False
dataloader.test.dataset.names = "coco_2017_val_panoptic_separated"


dataloader.evaluator = [
    L(COCOEvaluator)(
        dataset_name="${...test.dataset.names}",
    ),
    L(SemSegEvaluator)(
        dataset_name="${...test.dataset.names}",
    ),
    L(COCOPanopticEvaluator)(
        dataset_name="${...test.dataset.names}",
    ),
]
@ -0,0 +1,36 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads

from .mask_rcnn_fpn import model

# arguments that don't exist for Cascade R-CNN
[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]]

model.roi_heads.update(
    _target_=CascadeROIHeads,
    box_heads=[
        L(FastRCNNConvFCHead)(
            input_shape=ShapeSpec(channels=256, height=7, width=7),
            conv_dims=[],
            fc_dims=[1024, 1024],
        )
        for k in range(3)
    ],
    box_predictors=[
        L(FastRCNNOutputLayers)(
            input_shape=ShapeSpec(channels=1024),
            test_score_thresh=0.05,
            box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
            cls_agnostic_bbox_reg=True,
            num_classes="${...num_classes}",
        )
        for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
    ],
    proposal_matchers=[
        L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
        for th in [0.5, 0.6, 0.7]
    ],
)
@ -0,0 +1,23 @@
from detectron2.modeling.meta_arch.fcos import FCOS, FCOSHead

from .retinanet import model

model._target_ = FCOS

del model.anchor_generator
del model.box2box_transform
del model.anchor_matcher
del model.input_format

# Use P5 instead of C5 to compute P6/P7
# (Sec 2.2 of https://arxiv.org/abs/2006.09214)
model.backbone.top_block.in_feature = "p5"
model.backbone.top_block.in_channels = 256

# New score threshold determined based on sqrt(cls_score * centerness)
model.test_score_thresh = 0.2
model.test_nms_thresh = 0.6

model.head._target_ = FCOSHead
del model.head.num_anchors
model.head.norm = "GN"
@ -0,0 +1,33 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.roi_heads import KRCNNConvDeconvUpsampleHead

from .mask_rcnn_fpn import model

[model.roi_heads.pop(x) for x in ["mask_in_features", "mask_pooler", "mask_head"]]

model.roi_heads.update(
    num_classes=1,
    keypoint_in_features=["p2", "p3", "p4", "p5"],
    keypoint_pooler=L(ROIPooler)(
        output_size=14,
        scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
        sampling_ratio=0,
        pooler_type="ROIAlignV2",
    ),
    keypoint_head=L(KRCNNConvDeconvUpsampleHead)(
        input_shape=ShapeSpec(channels=256, width=14, height=14),
        num_keypoints=17,
        conv_dims=[512] * 8,
        loss_normalizer="visible",
    ),
)

# Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2.
# 1000 proposals per-image is found to hurt box AP.
# Therefore we increase it to 1500 per-image.
model.proposal_generator.post_nms_topk = (1500, 1000)

# Keypoint AP degrades (though box AP improves) when using plain L1 loss
model.roi_heads.box_predictor.smooth_l1_beta = 0.5
@ -0,0 +1,88 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.meta_arch import GeneralizedRCNN
from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
from detectron2.modeling.backbone import BasicStem, BottleneckBlock, ResNet
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.proposal_generator import RPN, StandardRPNHead
from detectron2.modeling.roi_heads import (
    FastRCNNOutputLayers,
    MaskRCNNConvUpsampleHead,
    Res5ROIHeads,
)

model = L(GeneralizedRCNN)(
    backbone=L(ResNet)(
        stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
        stages=L(ResNet.make_default_stages)(
            depth=50,
            stride_in_1x1=True,
            norm="FrozenBN",
        ),
        out_features=["res4"],
    ),
    proposal_generator=L(RPN)(
        in_features=["res4"],
        head=L(StandardRPNHead)(in_channels=1024, num_anchors=15),
        anchor_generator=L(DefaultAnchorGenerator)(
            sizes=[[32, 64, 128, 256, 512]],
            aspect_ratios=[0.5, 1.0, 2.0],
            strides=[16],
            offset=0.0,
        ),
        anchor_matcher=L(Matcher)(
            thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True
        ),
        box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
        batch_size_per_image=256,
        positive_fraction=0.5,
        pre_nms_topk=(12000, 6000),
        post_nms_topk=(2000, 1000),
        nms_thresh=0.7,
    ),
    roi_heads=L(Res5ROIHeads)(
        num_classes=80,
        batch_size_per_image=512,
        positive_fraction=0.25,
        proposal_matcher=L(Matcher)(
            thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False
        ),
        in_features=["res4"],
        pooler=L(ROIPooler)(
            output_size=14,
            scales=(1.0 / 16,),
            sampling_ratio=0,
            pooler_type="ROIAlignV2",
        ),
        res5=L(ResNet.make_stage)(
            block_class=BottleneckBlock,
            num_blocks=3,
            stride_per_block=[2, 1, 1],
            in_channels=1024,
            bottleneck_channels=512,
            out_channels=2048,
            norm="FrozenBN",
            stride_in_1x1=True,
        ),
        box_predictor=L(FastRCNNOutputLayers)(
            input_shape=L(ShapeSpec)(channels="${...res5.out_channels}", height=1, width=1),
            test_score_thresh=0.05,
            box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)),
            num_classes="${..num_classes}",
        ),
        mask_head=L(MaskRCNNConvUpsampleHead)(
            input_shape=L(ShapeSpec)(
                channels="${...res5.out_channels}",
                width="${...pooler.output_size}",
                height="${...pooler.output_size}",
            ),
            num_classes="${..num_classes}",
            conv_dims=[256],
        ),
    ),
    pixel_mean=[103.530, 116.280, 123.675],
    pixel_std=[1.0, 1.0, 1.0],
    input_format="BGR",
)
@ -0,0 +1,93 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.meta_arch import GeneralizedRCNN
from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
from detectron2.modeling.backbone.fpn import LastLevelMaxPool
from detectron2.modeling.backbone import BasicStem, FPN, ResNet
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.proposal_generator import RPN, StandardRPNHead
from detectron2.modeling.roi_heads import (
    StandardROIHeads,
    FastRCNNOutputLayers,
    MaskRCNNConvUpsampleHead,
    FastRCNNConvFCHead,
)

model = L(GeneralizedRCNN)(
    backbone=L(FPN)(
        bottom_up=L(ResNet)(
            stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
            stages=L(ResNet.make_default_stages)(
                depth=50,
                stride_in_1x1=True,
                norm="FrozenBN",
            ),
            out_features=["res2", "res3", "res4", "res5"],
        ),
        in_features="${.bottom_up.out_features}",
        out_channels=256,
        top_block=L(LastLevelMaxPool)(),
    ),
    proposal_generator=L(RPN)(
        in_features=["p2", "p3", "p4", "p5", "p6"],
        head=L(StandardRPNHead)(in_channels=256, num_anchors=3),
        anchor_generator=L(DefaultAnchorGenerator)(
            sizes=[[32], [64], [128], [256], [512]],
            aspect_ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64],
            offset=0.0,
        ),
        anchor_matcher=L(Matcher)(
            thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True
        ),
        box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
        batch_size_per_image=256,
        positive_fraction=0.5,
        pre_nms_topk=(2000, 1000),
        post_nms_topk=(1000, 1000),
        nms_thresh=0.7,
    ),
    roi_heads=L(StandardROIHeads)(
        num_classes=80,
        batch_size_per_image=512,
        positive_fraction=0.25,
        proposal_matcher=L(Matcher)(
            thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False
        ),
        box_in_features=["p2", "p3", "p4", "p5"],
        box_pooler=L(ROIPooler)(
            output_size=7,
            scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
            sampling_ratio=0,
            pooler_type="ROIAlignV2",
        ),
        box_head=L(FastRCNNConvFCHead)(
            input_shape=ShapeSpec(channels=256, height=7, width=7),
            conv_dims=[],
            fc_dims=[1024, 1024],
        ),
        box_predictor=L(FastRCNNOutputLayers)(
            input_shape=ShapeSpec(channels=1024),
            test_score_thresh=0.05,
            box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)),
            num_classes="${..num_classes}",
        ),
        mask_in_features=["p2", "p3", "p4", "p5"],
        mask_pooler=L(ROIPooler)(
            output_size=14,
            scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32),
            sampling_ratio=0,
            pooler_type="ROIAlignV2",
        ),
        mask_head=L(MaskRCNNConvUpsampleHead)(
            input_shape=ShapeSpec(channels=256, width=14, height=14),
            num_classes="${..num_classes}",
            conv_dims=[256, 256, 256, 256, 256],
        ),
    ),
    pixel_mean=[103.530, 116.280, 123.675],
    pixel_std=[1.0, 1.0, 1.0],
    input_format="BGR",
)
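These model definitions are meant to be combined with the dataloader, schedule, and optimizer configs above into a single training config. A hypothetical top-level lazy config is sketched below; the paths are assumptions that mirror detectron2's `configs/common/` layout and may need adjusting for this repository, and the init checkpoint simply reuses the ImageNet weights referenced in the YAML configs earlier in this diff.

```python
# Hypothetical composition (paths are assumptions; adjust to this repository's layout).
from detectron2.model_zoo import get_config

model = get_config("common/models/mask_rcnn_fpn.py").model
dataloader = get_config("common/data/coco.py").dataloader
optimizer = get_config("common/optim.py").SGD
lr_multiplier = get_config("common/coco_schedule.py").lr_multiplier_1x
train = get_config("common/train.py").train

train.max_iter = 90000  # "1x" schedule at total batch size 16
train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
```

Such a file can then be passed to a lazy-config training script (e.g. detectron2's `tools/lazyconfig_train_net.py`).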
@ -0,0 +1,20 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling import PanopticFPN
from detectron2.modeling.meta_arch.semantic_seg import SemSegFPNHead

from .mask_rcnn_fpn import model

model._target_ = PanopticFPN
model.sem_seg_head = L(SemSegFPNHead)(
    input_shape={
        f: L(ShapeSpec)(stride=s, channels="${....backbone.out_channels}")
        for f, s in zip(["p2", "p3", "p4", "p5"], [4, 8, 16, 32])
    },
    ignore_value=255,
    num_classes=54,  # COCO stuff + 1
    conv_dims=128,
    common_stride=4,
    loss_weight=0.5,
    norm="GN",
)
@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-

from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.meta_arch import RetinaNet
from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
from detectron2.modeling.backbone.fpn import LastLevelP6P7
from detectron2.modeling.backbone import BasicStem, FPN, ResNet
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.meta_arch.retinanet import RetinaNetHead

model = L(RetinaNet)(
    backbone=L(FPN)(
        bottom_up=L(ResNet)(
            stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
            stages=L(ResNet.make_default_stages)(
                depth=50,
                stride_in_1x1=True,
                norm="FrozenBN",
            ),
            out_features=["res3", "res4", "res5"],
        ),
        in_features=["res3", "res4", "res5"],
        out_channels=256,
        top_block=L(LastLevelP6P7)(in_channels=2048, out_channels="${..out_channels}"),
    ),
    head=L(RetinaNetHead)(
        # Shape for each input feature map
        input_shape=[ShapeSpec(channels=256)] * 5,
        num_classes="${..num_classes}",
        conv_dims=[256, 256, 256, 256],
        prior_prob=0.01,
        num_anchors=9,
    ),
    anchor_generator=L(DefaultAnchorGenerator)(
        sizes=[[x, x * 2 ** (1.0 / 3), x * 2 ** (2.0 / 3)] for x in [32, 64, 128, 256, 512]],
        aspect_ratios=[0.5, 1.0, 2.0],
        strides=[8, 16, 32, 64, 128],
        offset=0.0,
    ),
    box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]),
    anchor_matcher=L(Matcher)(
        thresholds=[0.4, 0.5], labels=[0, -1, 1], allow_low_quality_matches=True
    ),
    num_classes=80,
    head_in_features=["p3", "p4", "p5", "p6", "p7"],
    focal_loss_alpha=0.25,
    focal_loss_gamma=2.0,
    pixel_mean=[103.530, 116.280, 123.675],
    pixel_std=[1.0, 1.0, 1.0],
    input_format="BGR",
)
@ -0,0 +1,15 @@
import torch

from detectron2.config import LazyCall as L
from detectron2.solver.build import get_default_optimizer_params

SGD = L(torch.optim.SGD)(
    params=L(get_default_optimizer_params)(
        # params.model is meant to be set to the model object, before instantiating
        # the optimizer.
        weight_decay_norm=0.0
    ),
    lr=0.02,
    momentum=0.9,
    weight_decay=1e-4,
)
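As the inline comment notes, `params.model` must point at the model before the optimizer is instantiated. A minimal sketch of that wiring follows; the `nn.Linear` is only a stand-in for a real detection model.

```python
# Illustration only: wire the lazily-defined optimizer above to a concrete model.
import torch.nn as nn
from detectron2.config import instantiate

model = nn.Linear(10, 2)        # stand-in for a real model
SGD.params.model = model        # get_default_optimizer_params reads the model from here
optimizer = instantiate(SGD)    # only now is torch.optim.SGD actually constructed
```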