2024-03-13 13:59:00 +08:00
|
|
|
# --------------------------------------------------------
|
|
|
|
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
|
|
|
|
# Copyright (c) 2022 Microsoft
|
|
|
|
# Licensed under The MIT License [see LICENSE for details]
|
|
|
|
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
|
|
|
|
# --------------------------------------------------------
|
|
|
|
# Copyright (c) 2024 Microsoft
|
|
|
|
# Licensed under The MIT License [see LICENSE for details]
|
|
|
|
# Written by Feng Li (fliay@connect.ust.hk)
|
|
|
|
# --------------------------------------------------------
|
|
|
|
|
|
|
|
##################
|
|
|
|
# Task settings
|
|
|
|
##################
|
|
|
|
WEIGHT: ''
|
|
|
|
PORT: 53711
|
|
|
|
VERBOSE: true
|
|
|
|
track_thresh_hold: 1
|
|
|
|
OUTPUT_DIR: '../../data/output/test'
|
|
|
|
# misc
|
|
|
|
LOADER:
|
|
|
|
JOINT: True
|
|
|
|
KEY_DATASET: 'coco'
|
|
|
|
# Spatial sampler
|
|
|
|
STROKE_SAMPLER:
|
|
|
|
MAX_CANDIDATE: 50
|
|
|
|
CANDIDATE_PROBS: [1.0] # for training only
|
|
|
|
CANDIDATE_NAMES: ["Scribble"]
|
|
|
|
DILATION: 3
|
|
|
|
SCRIBBLE:
|
|
|
|
NUM_STROKES: 5
|
|
|
|
STROKE_PRESET: ['rand_curve', 'rand_curve_small']
|
|
|
|
STROKE_PROB: [0.5, 0.5]
|
|
|
|
EVAL:
|
|
|
|
MODE: 'best' # best/random
|
|
|
|
NEGATIVE: False
|
|
|
|
MAX_ITER: 20
|
|
|
|
# model
|
|
|
|
MODEL:
|
|
|
|
NAME: dinov
|
|
|
|
HEAD: general_head
|
|
|
|
MASK_ON: false
|
|
|
|
KEYPOINT_ON: false
|
|
|
|
LOAD_PROPOSALS: false
|
|
|
|
DIM_PROJ: 512
|
|
|
|
BACKBONE_DIM: 768
|
|
|
|
BACKGROUND: False
|
|
|
|
WEIGHTS: ''
|
|
|
|
TEXT:
|
|
|
|
ARCH: noencoder
|
|
|
|
NAME: transformer
|
|
|
|
TOKENIZER: clip
|
|
|
|
CONTEXT_LENGTH: 18 # alternative value: 77
|
|
|
|
WIDTH: 512
|
|
|
|
HEADS: 8
|
|
|
|
LAYERS: 12 # alternative value: 6
|
|
|
|
AUTOGRESSIVE: True
|
|
|
|
BACKBONE:
|
|
|
|
NAME: swin
|
|
|
|
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'
|
|
|
|
LOAD_PRETRAINED: true
|
|
|
|
SWIN:
|
|
|
|
PRETRAIN_IMG_SIZE: 224
|
|
|
|
PATCH_SIZE: 4
|
|
|
|
EMBED_DIM: 96
|
|
|
|
DEPTHS: [ 2, 2, 6, 2 ]
|
|
|
|
NUM_HEADS: [ 3, 6, 12, 24 ]
|
|
|
|
WINDOW_SIZE: 7
|
|
|
|
MLP_RATIO: 4.0
|
|
|
|
QKV_BIAS: true
|
|
|
|
QK_SCALE: ~
|
|
|
|
DROP_RATE: 0.0
|
|
|
|
ATTN_DROP_RATE: 0.0
|
|
|
|
DROP_PATH_RATE: 0.3
|
|
|
|
APE: false
|
|
|
|
PATCH_NORM: true
|
|
|
|
USE_CHECKPOINT: false
|
|
|
|
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
|
|
|
|
ENCODER:
|
|
|
|
NAME: encoder_deform
|
|
|
|
IGNORE_VALUE: 255
|
|
|
|
NUM_CLASSES: 133
|
|
|
|
LOSS_WEIGHT: 1.0
|
|
|
|
CONVS_DIM: 256
|
|
|
|
MASK_DIM: 256
|
|
|
|
NORM: "GN"
|
|
|
|
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
|
|
|
|
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
|
|
|
|
COMMON_STRIDE: 4
|
|
|
|
TRANSFORMER_ENC_LAYERS: 6
|
|
|
|
TOTAL_NUM_FEATURE_LEVELS: 4
|
|
|
|
NUM_FEATURE_LEVELS: 3
|
|
|
|
FEATURE_ORDER: "low2high"
|
|
|
|
DECODER:
|
|
|
|
NAME: dinov_openset_decoder
|
|
|
|
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
|
|
|
|
MASK: True
|
|
|
|
BOX: True
|
|
|
|
PART: True
|
|
|
|
GROUNDING:
|
|
|
|
ENABLED: False
|
|
|
|
MAX_LEN: 5
|
|
|
|
TEXT_WEIGHT: 2.0
|
|
|
|
CLASS_WEIGHT: 0.5
|
|
|
|
CAPTION:
|
|
|
|
ENABLED: False
|
|
|
|
PHRASE_PROB: 0.0
|
|
|
|
SIM_THRES: 0.95
|
|
|
|
CAPTIONING:
|
|
|
|
ENABLED: False
|
|
|
|
STEP: 50
|
|
|
|
RETRIEVAL:
|
|
|
|
ENABLED: False
|
|
|
|
DIM_IMG: 768
|
|
|
|
ENSEMBLE: True
|
|
|
|
OPENIMAGE:
|
|
|
|
ENABLED: False
|
|
|
|
NEGATIVE_SAMPLES: 5
|
|
|
|
GROUNDING:
|
|
|
|
ENABLED: False
|
|
|
|
MAX_LEN: 5
|
|
|
|
DEEP_SUPERVISION: True
|
|
|
|
NO_OBJECT_WEIGHT: 0.1
|
|
|
|
CLASS_WEIGHT: 4.0
|
|
|
|
MASK_WEIGHT: 5.0
|
|
|
|
DICE_WEIGHT: 5.0
|
|
|
|
BOX_WEIGHT: 5.0
|
|
|
|
GIOU_WEIGHT: 2.0
|
|
|
|
IOU_WEIGHT: 1.0
|
|
|
|
COST_CLASS_WEIGHT: 4.0
|
|
|
|
COST_DICE_WEIGHT: 5.0
|
|
|
|
COST_MASK_WEIGHT: 5.0
|
|
|
|
COST_BOX_WEIGHT: 5.0
|
|
|
|
COST_GIOU_WEIGHT: 2.0
|
|
|
|
MATCH_CLASS_WEIGHT: 2.0
|
|
|
|
HIDDEN_DIM: 256
|
|
|
|
NUM_OBJECT_QUERIES: 300
|
|
|
|
NHEADS: 8
|
|
|
|
DROPOUT: 0.0
|
|
|
|
DIM_FEEDFORWARD: 2048
|
|
|
|
ENC_LAYERS: 0
|
|
|
|
PRE_NORM: False
|
|
|
|
ENFORCE_INPUT_PROJ: False
|
|
|
|
SIZE_DIVISIBILITY: 32
|
|
|
|
DEC_LAYERS: 9 # 9 decoder layers, plus one for the loss on the learnable query (TODO confirm whether that extra one is already counted in 9)
|
|
|
|
TRAIN_NUM_POINTS: 12544
|
|
|
|
OVERSAMPLE_RATIO: 3.0
|
|
|
|
IMPORTANCE_SAMPLE_RATIO: 0.75
|
|
|
|
TWO_STAGE: False
|
|
|
|
INITIALIZE_BOX_TYPE: 'no'
|
|
|
|
DN: seg
|
|
|
|
DN_NOISE_SCALE: 0.4
|
|
|
|
DN_NUM: 100
|
|
|
|
INITIAL_PRED: False
|
|
|
|
LEARN_TGT: False
|
|
|
|
TOTAL_NUM_FEATURE_LEVELS: 4
|
|
|
|
SEMANTIC_CE_LOSS: False
|
|
|
|
PANO_BOX_LOSS: False
|
|
|
|
COCO: True
|
|
|
|
O365: False
|
|
|
|
SAM: True
|
|
|
|
COCO_TRACK: False
|
|
|
|
PASCAL: False
|
|
|
|
RE_POINT: True
|
|
|
|
NUM_INTERACTIVE_TOKENS: 6
|
|
|
|
NUM_CONTENT_TOKENS: 1
|
|
|
|
MAX_NUM_INSTANCE: 50
|
|
|
|
MAX_NUM_INSTANCE_CONTENT: 8
|
|
|
|
CONTENT_INDEPENDENT: True
|
|
|
|
USE_SHAPE_SAMPLER: True
|
|
|
|
OPENSET_USE_SHAPE_SAMPLER: False
|
|
|
|
INFERENCE_EXAMPLE: 4
|
|
|
|
MAX_TRAIN_EXAMPLE: 4
|
|
|
|
CROSS_GPU_BATCH: True
|
|
|
|
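CONTENT_INDEPENDENT: True # NOTE(review): this key already appears a few entries earlier with the same value; looks like a redundant duplicate — confirm and keep only one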
|
|
|
|
NMS_THRESHOLD: 0.7
|
|
|
|
REFER_IMAGE: False
|
|
|
|
point_per_side: 20
|
2024-03-21 10:14:43 +08:00
|
|
|
MAX_MEMORY_SIZE: 9
|
2024-03-13 13:59:00 +08:00
|
|
|
softmax: True
|
|
|
|
MANY2MANY: True
|
|
|
|
VIS: False
|
|
|
|
freeze_all: False
|
|
|
|
freeze_backbone_enc: False
|
|
|
|
freeze_backbone_enc_decoder: False
|
|
|
|
TEST:
|
|
|
|
SEMANTIC_ON: True
|
|
|
|
INSTANCE_ON: True
|
|
|
|
PANOPTIC_ON: True
|
|
|
|
BOX_INTERACTIVE: False
|
|
|
|
CLASSIFICATION_ON: False
|
|
|
|
OVERLAP_THRESHOLD: 0.8
|
|
|
|
OBJECT_MASK_THRESHOLD: 0.25
|
|
|
|
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
|
|
|
|
TEST_FOUCUS_ON_BOX: False
|
|
|
|
PANO_TRANSFORM_EVAL: True
|
|
|
|
PANO_TEMPERATURE: 0.06
|
|
|
|
|
|
|
|
TEST:
|
|
|
|
EVAL_PERIOD: 500000
|
|
|
|
PRECISE_BN:
|
|
|
|
NUM_ITER: 1
|
|
|
|
ENABLED: False
|
|
|
|
AUG:
|
|
|
|
ENABLED: False
|
|
|
|
|
|
|
|
DATASETS:
|
|
|
|
TRAIN: ["sam_train", "coco_2017_train_panoptic"]
|
|
|
|
TEST: ["coco_2017_val_panoptic_with_sem_seg"]
|
|
|
|
# TEST: ["ade20k_panoptic_val", "ade20k_full_sem_seg_val", "sunrgbd_37_val_seg", "scannet_21_val_seg", "scannet_21_panoptic_val", "scannet_41_val_seg", "cityscapes_fine_panoptic_val", "cityscapes_fine_instance_seg_val", "cityscapes_fine_sem_seg_val", "bdd10k_val_sem_seg", "bdd10k_40_panoptic_val"]
|
|
|
|
CLASS_CONCAT: false
|
|
|
|
SIZE_DIVISIBILITY: 32
|
|
|
|
PROPOSAL_FILES_TRAIN: []
|
|
|
|
|
|
|
|
SAM:
|
|
|
|
INPUT:
|
|
|
|
MIN_SIZE_TEST: 800
|
|
|
|
MAX_SIZE_TEST: 1333
|
|
|
|
IMAGE_SIZE: 1024
|
|
|
|
MIN_SCALE: 0.99
|
|
|
|
MAX_SCALE: 1.01
|
|
|
|
DATASET_MAPPER_NAME: "sam_content"
|
|
|
|
IGNORE_VALUE: 255
|
|
|
|
COLOR_AUG_SSD: False
|
|
|
|
SIZE_DIVISIBILITY: 32
|
|
|
|
RANDOM_FLIP: "horizontal"
|
|
|
|
MASK_FORMAT: "polygon"
|
|
|
|
FORMAT: "RGB"
|
|
|
|
CROP:
|
|
|
|
ENABLED: True
|
|
|
|
DATASET:
|
|
|
|
DATASET: 'sam'
|
|
|
|
TEST:
|
|
|
|
DETECTIONS_PER_IMAGE: 100
|
|
|
|
NAME: coco_eval
|
|
|
|
IOU_TYPE: ['bbox', 'segm']
|
|
|
|
USE_MULTISCALE: false
|
|
|
|
BATCH_SIZE_TOTAL: 8
|
|
|
|
MODEL_FILE: ''
|
|
|
|
AUG:
|
|
|
|
ENABLED: False
|
|
|
|
TRAIN:
|
|
|
|
BATCH_SIZE_TOTAL: 1
|
|
|
|
BATCH_SIZE_PER_GPU: 1
|
|
|
|
SHUFFLE: true
|
|
|
|
DATALOADER:
|
|
|
|
FILTER_EMPTY_ANNOTATIONS: False
|
|
|
|
NUM_WORKERS: 4
|
|
|
|
LOAD_PROPOSALS: False
|
|
|
|
SAMPLER_TRAIN: "TrainingSampler"
|
|
|
|
ASPECT_RATIO_GROUPING: True
|
|
|
|
|
|
|
|
COCO:
|
|
|
|
INPUT:
|
|
|
|
MIN_SIZE_TEST: 800
|
|
|
|
MAX_SIZE_TEST: 1333
|
|
|
|
IMAGE_SIZE: 1024
|
|
|
|
MIN_SCALE: 0.1
|
|
|
|
MAX_SCALE: 2.0
|
|
|
|
DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj"
|
|
|
|
IGNORE_VALUE: 255
|
|
|
|
COLOR_AUG_SSD: False
|
|
|
|
SIZE_DIVISIBILITY: 32
|
|
|
|
RANDOM_FLIP: "horizontal"
|
|
|
|
MASK_FORMAT: "polygon"
|
|
|
|
FORMAT: "RGB"
|
|
|
|
CROP:
|
|
|
|
ENABLED: True
|
|
|
|
DATASET:
|
|
|
|
DATASET: 'coco'
|
|
|
|
TEST:
|
|
|
|
DETECTIONS_PER_IMAGE: 100
|
|
|
|
NAME: coco_eval
|
|
|
|
IOU_TYPE: ['bbox', 'segm']
|
|
|
|
USE_MULTISCALE: false
|
|
|
|
BATCH_SIZE_TOTAL: 1
|
|
|
|
MODEL_FILE: ''
|
|
|
|
AUG:
|
|
|
|
ENABLED: False
|
|
|
|
TRAIN:
|
|
|
|
BATCH_SIZE_TOTAL: 1
|
|
|
|
BATCH_SIZE_PER_GPU: 1
|
|
|
|
SHUFFLE: true
|
|
|
|
DATALOADER:
|
|
|
|
FILTER_EMPTY_ANNOTATIONS: False
|
|
|
|
NUM_WORKERS: 2
|
|
|
|
LOAD_PROPOSALS: False
|
|
|
|
SAMPLER_TRAIN: "TrainingSampler"
|
|
|
|
ASPECT_RATIO_GROUPING: True
|
|
|
|
|
|
|
|
INPUT:
|
|
|
|
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
|
|
|
PIXEL_STD: [58.395, 57.120, 57.375]
|
|
|
|
|
|
|
|
DATALOADER:
|
|
|
|
FILTER_EMPTY_ANNOTATIONS: False
|
|
|
|
NUM_WORKERS: 16
|
|
|
|
LOAD_PROPOSALS: False
|
|
|
|
SAMPLER_TRAIN: "TrainingSampler"
|
|
|
|
ASPECT_RATIO_GROUPING: True
|
|
|
|
|
|
|
|
# Detectron2 training config for the optimizer and the LR scheduler
|
|
|
|
SOLVER:
|
|
|
|
BASE_LR_END: 0.0
|
|
|
|
MOMENTUM: 0.9
|
|
|
|
NESTEROV: False
|
|
|
|
CHECKPOINT_PERIOD: 5000
|
|
|
|
IMS_PER_BATCH: 1
|
|
|
|
REFERENCE_WORLD_SIZE: 0
|
|
|
|
BIAS_LR_FACTOR: 1.0
|
|
|
|
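WEIGHT_DECAY_BIAS: None # NOTE(review): YAML reads a bare None as the string "None", not null (this file writes null as ~ for QK_SCALE) — confirm which is intended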
|
|
|
|
# original
|
|
|
|
BASE_LR: 0.0001
|
|
|
|
# STEPS: [327778, 355092]
|
|
|
|
STEPS: [67778, 105092]
|
|
|
|
# MAX_ITER: 368750
|
|
|
|
MAX_ITER: 168750
|
|
|
|
GAMMA: 0.1
|
|
|
|
WARMUP_FACTOR: 1.0
|
|
|
|
WARMUP_ITERS: 10
|
|
|
|
WARMUP_METHOD: "linear"
|
|
|
|
WEIGHT_DECAY: 0.05
|
|
|
|
OPTIMIZER: "ADAMW"
|
|
|
|
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
|
|
|
|
LR_MULTIPLIER:
|
|
|
|
backbone: 0.1
|
|
|
|
lang_encoder: 0.1
|
|
|
|
WEIGHT_DECAY_NORM: 0.0
|
|
|
|
WEIGHT_DECAY_EMBED: 0.0
|
|
|
|
CLIP_GRADIENTS:
|
|
|
|
ENABLED: True
|
|
|
|
CLIP_TYPE: "full_model"
|
|
|
|
CLIP_VALUE: 0.01
|
|
|
|
NORM_TYPE: 2.0
|
|
|
|
AMP:
|
|
|
|
ENABLED: True
|
|
|
|
|
|
|
|
# Evaluation Dataset
|
|
|
|
ADE20K:
|
|
|
|
INPUT:
|
|
|
|
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
|
|
|
|
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
|
|
|
MIN_SIZE_TEST: 640
|
|
|
|
MAX_SIZE_TRAIN: 2560
|
|
|
|
MAX_SIZE_TEST: 2560
|
|
|
|
MASK_FORMAT: "polygon"
|
|
|
|
CROP:
|
|
|
|
ENABLED: True
|
|
|
|
TYPE: "absolute"
|
|
|
|
SIZE: [640, 640]
|
|
|
|
SINGLE_CATEGORY_MAX_AREA: 1.0
|
|
|
|
IGNORE_VALUE: 255
|
|
|
|
COLOR_AUG_SSD: True
|
|
|
|
SIZE_DIVISIBILITY: 640 # used in dataset mapper
|
|
|
|
DATASET_MAPPER_NAME: "mask_former_panoptic"
|
|
|
|
FORMAT: "RGB"
|
|
|
|
DATASET:
|
|
|
|
DATASET: 'ade'
|
|
|
|
TRAIN:
|
|
|
|
ASPECT_RATIO_GROUPING: true
|
|
|
|
BATCH_SIZE_TOTAL: 16
|
|
|
|
BATCH_SIZE_PER_GPU: 2
|
|
|
|
SHUFFLE: true
|
|
|
|
TEST:
|
|
|
|
DETECTIONS_PER_IMAGE: 100
|
|
|
|
NAME: coco_eval
|
|
|
|
IOU_TYPE: ['bbox', 'segm']
|
|
|
|
USE_MULTISCALE: false
|
|
|
|
BATCH_SIZE_TOTAL: 1
|
|
|
|
MODEL_FILE: ''
|
|
|
|
AUG:
|
|
|
|
ENABLED: False
|
|
|
|
DATALOADER:
|
|
|
|
FILTER_EMPTY_ANNOTATIONS: False
|
|
|
|
NUM_WORKERS: 8
|
|
|
|
LOAD_PROPOSALS: False
|
|
|
|
SAMPLER_TRAIN: "TrainingSampler"
|
|
|
|
ASPECT_RATIO_GROUPING: True
|
|
|
|
|
|
|
|
DAVIS:
|
|
|
|
INPUT:
|
2024-03-21 10:14:43 +08:00
|
|
|
MIN_SIZE_TEST: 640
|
2024-03-13 13:59:00 +08:00
|
|
|
MAX_SIZE_TEST: 1333
|
|
|
|
DATALOADER:
|
|
|
|
FILTER_EMPTY_ANNOTATIONS: False
|
|
|
|
NUM_WORKERS: 0
|
|
|
|
LOAD_PROPOSALS: False
|
|
|
|
SAMPLER_TRAIN: "TrainingSampler"
|
|
|
|
ASPECT_RATIO_GROUPING: False
|
|
|
|
TEST:
|
|
|
|
BATCH_SIZE_TOTAL: 1
|
|
|
|
|
|
|
|
VOS:
|
|
|
|
INPUT:
|
2024-03-21 10:14:43 +08:00
|
|
|
MIN_SIZE_TEST: 640
|
2024-03-13 13:59:00 +08:00
|
|
|
MAX_SIZE_TEST: 1333
|
|
|
|
DATALOADER:
|
|
|
|
FILTER_EMPTY_ANNOTATIONS: False
|
|
|
|
NUM_WORKERS: 0
|
|
|
|
LOAD_PROPOSALS: False
|
|
|
|
SAMPLER_TRAIN: "TrainingSampler"
|
|
|
|
ASPECT_RATIO_GROUPING: False
|
|
|
|
TEST:
|
|
|
|
BATCH_SIZE_TOTAL: 1
|