DINOv/configs/dinov_sam_coco_train.yaml

405 lines
9.5 KiB
YAML

# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
# Copyright (c) 2024 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Feng Li (fliay@connect.ust.hk)
# --------------------------------------------------------
##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
VERBOSE: true
track_thresh_hold: 1
OUTPUT_DIR: '../../data/output/test'
# misc
LOADER:
JOINT: True
KEY_DATASET: 'coco'
# Spatial sampler
STROKE_SAMPLER:
MAX_CANDIDATE: 50
CANDIDATE_PROBS: [1.0] # for training only
CANDIDATE_NAMES: ["Scribble"]
DILATION: 3
SCRIBBLE:
NUM_STROKES: 5
STROKE_PRESET: ['rand_curve', 'rand_curve_small']
STROKE_PROB: [0.5, 0.5]
EVAL:
MODE: 'best' # best/random
NEGATIVE: False
MAX_ITER: 20
# model
MODEL:
NAME: dinov
HEAD: general_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
BACKBONE_DIM: 768
BACKGROUND: False
WEIGHTS: ''
TEXT:
ARCH: noencoder
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 18 # 77
WIDTH: 512
HEADS: 8
LAYERS: 12 # 6
AUTOGRESSIVE: True
BACKBONE:
NAME: swin
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'
LOAD_PRETRAINED: true
SWIN:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 96
DEPTHS: [ 2, 2, 6, 2 ]
NUM_HEADS: [ 3, 6, 12, 24 ]
WINDOW_SIZE: 7
MLP_RATIO: 4.0
QKV_BIAS: true
QK_SCALE: ~
DROP_RATE: 0.0
ATTN_DROP_RATE: 0.0
DROP_PATH_RATE: 0.3
APE: false
PATCH_NORM: true
USE_CHECKPOINT: false
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
ENCODER:
NAME: encoder_deform
IGNORE_VALUE: 255
NUM_CLASSES: 133
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
TOTAL_NUM_FEATURE_LEVELS: 4
NUM_FEATURE_LEVELS: 3
FEATURE_ORDER: "low2high"
DECODER:
NAME: dinov_openset_decoder
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK: True
BOX: True
PART: True
GROUNDING:
ENABLED: False
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.0
SIM_THRES: 0.95
CAPTIONING:
ENABLED: False
STEP: 50
RETRIEVAL:
ENABLED: False
DIM_IMG: 768
ENSEMBLE: True
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 4.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
IOU_WEIGHT: 1.0
COST_CLASS_WEIGHT: 4.0
COST_DICE_WEIGHT: 5.0
COST_MASK_WEIGHT: 5.0
COST_BOX_WEIGHT: 5.0
COST_GIOU_WEIGHT: 2.0
MATCH_CLASS_WEIGHT: 2.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 300
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TWO_STAGE: False
INITIALIZE_BOX_TYPE: 'no'
DN: seg
DN_NOISE_SCALE: 0.4
DN_NUM: 100
INITIAL_PRED: False
LEARN_TGT: False
TOTAL_NUM_FEATURE_LEVELS: 4
SEMANTIC_CE_LOSS: False
PANO_BOX_LOSS: False
COCO: True
O365: False
SAM: True
COCO_TRACK: False
PASCAL: False
RE_POINT: True
NUM_INTERACTIVE_TOKENS: 6
NUM_CONTENT_TOKENS: 1
MAX_NUM_INSTANCE: 50
MAX_NUM_INSTANCE_CONTENT: 8
CONTENT_INDEPENDENT: True
USE_SHAPE_SAMPLER: True
OPENSET_USE_SHAPE_SAMPLER: False
INFERENCE_EXAMPLE: 4
MAX_TRAIN_EXAMPLE: 4
CROSS_GPU_BATCH: True
CONTENT_INDEPENDENT: True
NMS_THRESHOLD: 0.7
REFER_IMAGE: False
point_per_side: 20
MAX_MEMORY_SIZE: 9
softmax: True
MANY2MANY: True
VIS: False
freeze_all: False
freeze_backbone_enc: False
freeze_backbone_enc_decoder: False
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
BOX_INTERACTIVE: False
CLASSIFICATION_ON: False
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.25
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
TEST_FOUCUS_ON_BOX: False
PANO_TRANSFORM_EVAL: True
PANO_TEMPERATURE: 0.06
TEST:
EVAL_PERIOD: 500000
PRECISE_BN:
NUM_ITER: 1
ENABLED: False
AUG:
ENABLED: False
DATASETS:
TRAIN: ["sam_train", "coco_2017_train_panoptic"]
TEST: ["coco_2017_val_panoptic_with_sem_seg"]
# TEST: ["ade20k_panoptic_val", "ade20k_full_sem_seg_val", "sunrgbd_37_val_seg", "scannet_21_val_seg", "scannet_21_panoptic_val", "scannet_41_val_seg", "cityscapes_fine_panoptic_val", "cityscapes_fine_instance_seg_val", "cityscapes_fine_sem_seg_val", "bdd10k_val_sem_seg", "bdd10k_40_panoptic_val"]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
SAM:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.99
MAX_SCALE: 1.01
DATASET_MAPPER_NAME: "sam_content"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'sam'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR_END: 0.0
MOMENTUM: 0.9
NESTEROV: False
CHECKPOINT_PERIOD: 5000
IMS_PER_BATCH: 1
REFERENCE_WORLD_SIZE: 0
BIAS_LR_FACTOR: 1.0
WEIGHT_DECAY_BIAS: None
# original
BASE_LR: 0.0001
# STEPS: [327778, 355092]
STEPS: [67778, 105092]
# MAX_ITER: 368750
MAX_ITER: 168750
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
# Evaluation Dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [640, 640]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
DAVIS:
INPUT:
MIN_SIZE_TEST: 640
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 1
VOS:
INPUT:
MIN_SIZE_TEST: 640
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 1