Segment-Everything-Everywhe.../configs/seem/focall_unicl_lang_v0.yaml

401 lines
9.7 KiB
YAML

# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
# Define Test/Trainer/Saving
PIPELINE: XDecoderPipeline
TRAINER: xdecoder
SAVE_DIR: '../../data/output/test'
base_path: "./"
# Resume Logistic
RESUME: false
WEIGHT: false
RESUME_FROM: ''
EVAL_AT_START: false
# Logging and Debug
WANDB: False
LOG_EVERY: 100
FIND_UNUSED_PARAMETERS: false
# Speed up training
FP16: false
PORT: '36873'
# misc
LOADER:
JOINT: False
KEY_DATASET: 'coco'
##################
# Task settings
##################
VERBOSE: true
MODEL:
NAME: seem_model_v0
HEAD: xdecoder_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
TEXT:
ARCH: vlpencoder
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 77 # 77
WIDTH: 512
HEADS: 8
LAYERS: 12 # 6
AUTOGRESSIVE: True
BACKBONE:
NAME: focal
PRETRAINED: ''
LOAD_PRETRAINED: false
FOCAL:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 192
DEPTHS: [2, 2, 18, 2]
FOCAL_LEVELS: [4, 4, 4, 4]
FOCAL_WINDOWS: [3, 3, 3, 3]
DROP_PATH_RATE: 0.3
MLP_RATIO: 4.0
DROP_RATE: 0.0
PATCH_NORM: True
USE_CONV_EMBED: True
SCALING_MODULATOR: True
USE_CHECKPOINT: False
USE_POSTLN: true
USE_POSTLN_IN_MODULATION: false
USE_LAYERSCALE: True
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
OUT_INDICES: [0, 1, 2, 3]
ENCODER:
NAME: transformer_encoder_fpn
IGNORE_VALUE: 255
NUM_CLASSES: 133
LOSS_WEIGHT: 1.0
CONVS_DIM: 512
MASK_DIM: 512
NORM: "GN"
IN_FEATURES: ["res2", "res3", "res4", "res5"]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
DECODER:
NAME: seem_v0
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK:
ENABLED: True
DETECTION: False
SPATIAL:
ENABLED: True
MAX_ITER: 1
GROUNDING:
ENABLED: True
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
RETRIEVAL:
ENABLED: False
LVIS:
ENABLED: True
THRES: 0.7
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.5
SIM_THRES: 0.95
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
GCLASS_WEIGHT: 0.4
GMASK_WEIGHT: 1.0
GDICE_WEIGHT: 1.0
SCLASS_WEIGHT: 0.4
SMASK_WEIGHT: 1.0
SDICE_WEIGHT: 1.0
OCLASS_WEIGHT: 0.4
OMASK_WEIGHT: 1.0
ODICE_WEIGHT: 1.0
CLASS_WEIGHT: 2.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BBOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
CAPTION_WEIGHT: 2.0
COST_SPATIAL:
CLASS_WEIGHT: 5.0
MASK_WEIGHT: 2.0
DICE_WEIGHT: 2.0
HIDDEN_DIM: 512
NUM_OBJECT_QUERIES: 101
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
MAX_SPATIAL_LEN: [512, 512, 512, 512]
# ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
TOP_GROUNDING_LAYERS: 10
TOP_CAPTION_LAYERS: 10
TOP_SPATIAL_LAYERS: 10
TOP_OPENIMAGE_LAYERS: 10
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.8
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
# Spatial sampler
STROKE_SAMPLER:
MAX_CANDIDATE: 1
CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25] # for training only
CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"]
DILATION: 3
CIRCLE:
NUM_STROKES: 5
STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small']
STROKE_PROB: [0.33, 0.33, 0.33]
SCRIBBLE:
NUM_STROKES: 5
STROKE_PRESET: ['rand_curve', 'rand_curve_small']
STROKE_PROB: [0.5, 0.5]
POINT:
NUM_POINTS: 20
POLYGON:
MAX_POINTS: 9
EVAL:
MODE: 'best' # best/random/best_random
NEGATIVE: False
MAX_ITER: 20
IOU_ITER: 1
GROUNDING: False
# Multi-modal Architecture, order matters
ATTENTION_ARCH:
VARIABLE:
queries: ['object', 'grounding', 'spatial']
tokens: ['grounding', 'spatial']
memories: ['spatial']
SELF_ATTENTION:
queries:
object: ['queries_object']
grounding: ['queries_grounding', 'tokens_grounding']
spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial']
tokens:
grounding: ['queries_grounding', 'tokens_grounding']
spatial: ['tokens_spatial']
memories:
spatial: ['memories_spatial']
CROSS_ATTENTION:
queries:
object: True
grounding: True
spatial: True
memories:
spatial: True
tokens:
grounding: False
spatial: False
MASKING: ['tokens_spatial', 'tokens_grounding']
DUPLICATION:
queries:
grounding: 'queries_object'
spatial: 'queries_object'
SPATIAL_MEMORIES: 32
QUERY_NUMBER: 3
DATASETS:
TRAIN: ["coco_2017_train_panoptic_filtrefgumdval_with_sem_seg_caption_grounding_lvis",]
# TRAIN: ["coco_2017_train_panoptic_with_sem_seg_caption_grounding",]
TEST: ["coco_2017_val_panoptic_with_sem_seg", "pascalvoc_val_Point", "refcocog_val_umd"] # to evaluate instance and semantic performance as well
# TEST: ["pascalvoc_val_Point"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["cocomini_val_Point", "cocomini_val_Circle", "cocomini_val_Scribble", "cocomini_val_Polygon", "cocomini_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["ade600_val_Point", "ade600_val_Circle", "ade600_val_Scribble", "ade600_val_Polygon", "ade600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["openimage600_val_Point", "openimage600_val_Circle", "openimage600_val_Scribble", "openimage600_val_Polygon", "openimage600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 4
BATCH_SIZE_PER_GPU: 4
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TRAIN: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TRAIN_SAMPLING: 'choice'
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "coco_interactive"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
# Validation dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: 640
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: (640, 640)
SINGLE_CATEGORY_MAX_AREA: 1.0
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
SBD:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 1
VOC:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
DAVIS:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
VOS:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 1
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
SPATIAL: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR: 0.0001
STEPS: [0.88889, 0.96296]
MAX_ITER: 1
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
FIX_PARAM:
backbone: True
lang_encoder: True
pixel_decoder: True
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 5.0 # 0.01
NORM_TYPE: 2.0
MAX_NUM_EPOCHS: 50