mirror of https://github.com/FoundationVision/GLEE
Add video training and inference scripts for GLEE-Pro, and improve inference speed
parent
5f1832dd7a
commit
028ecee13c
|
@ -0,0 +1,42 @@
|
|||
_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml"
|
||||
MODEL:
|
||||
CROSS_TRACK: False
|
||||
PSEUDO_VIDEO: False
|
||||
FREEZE_WHOLE: False
|
||||
BACKBONE:
|
||||
NAME: "D2_EVA02"
|
||||
EVA02:
|
||||
CHECKPOINT: False
|
||||
# PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth'
|
||||
SEM_SEG_HEAD:
|
||||
# pixel decoder
|
||||
PIXEL_DECODER_NAME: "MaskDINOEncoder"
|
||||
DIM_FEEDFORWARD: 2048
|
||||
NUM_FEATURE_LEVELS: 4
|
||||
TOTAL_NUM_FEATURE_LEVELS: 4
|
||||
IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||
COMMON_STRIDE: 4
|
||||
TRANSFORMER_ENC_LAYERS: 6
|
||||
FEATURE_ORDER: "low2high"
|
||||
DATASETS:
|
||||
TRAIN: ("BURST_video_train",)
|
||||
TEST: ("BURST_video_val",)
|
||||
SOLVER:
|
||||
IMS_PER_BATCH: 8
|
||||
BASE_LR: 0.0001
|
||||
STEPS: (6000, )
|
||||
MAX_ITER: 8000
|
||||
CHECKPOINT_PERIOD: 2000
|
||||
INPUT:
|
||||
IMAGE_SIZE: 1536
|
||||
MIN_SCALE: 0.1
|
||||
MAX_SCALE: 2.0
|
||||
FORMAT: "RGB"
|
||||
DATASET_MAPPER_NAME: "coco_instance_lsj"
|
||||
TEST:
|
||||
EVAL_PERIOD: 100000
|
||||
DATALOADER:
|
||||
FILTER_EMPTY_ANNOTATIONS: False
|
||||
NUM_WORKERS: 8
|
||||
OUTPUT_DIR: ./GLEE_Pro_BURST
|
|
@ -0,0 +1,42 @@
|
|||
_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml"
|
||||
MODEL:
|
||||
CROSS_TRACK: False
|
||||
PSEUDO_VIDEO: False
|
||||
FREEZE_WHOLE: False
|
||||
BACKBONE:
|
||||
NAME: "D2_EVA02"
|
||||
EVA02:
|
||||
CHECKPOINT: False
|
||||
# PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth'
|
||||
SEM_SEG_HEAD:
|
||||
# pixel decoder
|
||||
PIXEL_DECODER_NAME: "MaskDINOEncoder"
|
||||
DIM_FEEDFORWARD: 2048
|
||||
NUM_FEATURE_LEVELS: 4
|
||||
TOTAL_NUM_FEATURE_LEVELS: 4
|
||||
IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||
COMMON_STRIDE: 4
|
||||
TRANSFORMER_ENC_LAYERS: 6
|
||||
FEATURE_ORDER: "low2high"
|
||||
DATASETS:
|
||||
TRAIN: ("BURST_video_train",)
|
||||
TEST: ("TAO_video_val",)
|
||||
SOLVER:
|
||||
IMS_PER_BATCH: 8
|
||||
BASE_LR: 0.0001
|
||||
STEPS: (6000, )
|
||||
MAX_ITER: 8000
|
||||
CHECKPOINT_PERIOD: 2000
|
||||
INPUT:
|
||||
IMAGE_SIZE: 1536
|
||||
MIN_SCALE: 0.1
|
||||
MAX_SCALE: 2.0
|
||||
FORMAT: "RGB"
|
||||
DATASET_MAPPER_NAME: "coco_instance_lsj"
|
||||
TEST:
|
||||
EVAL_PERIOD: 100000
|
||||
DATALOADER:
|
||||
FILTER_EMPTY_ANNOTATIONS: False
|
||||
NUM_WORKERS: 8
|
||||
OUTPUT_DIR: ./GLEE_Pro_TAO
|
|
@ -0,0 +1,42 @@
|
|||
_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml"
|
||||
MODEL:
|
||||
CROSS_TRACK: False
|
||||
PSEUDO_VIDEO: False
|
||||
FREEZE_WHOLE: False
|
||||
BACKBONE:
|
||||
NAME: "D2_EVA02"
|
||||
EVA02:
|
||||
CHECKPOINT: False
|
||||
# PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth'
|
||||
SEM_SEG_HEAD:
|
||||
# pixel decoder
|
||||
PIXEL_DECODER_NAME: "MaskDINOEncoder"
|
||||
DIM_FEEDFORWARD: 2048
|
||||
NUM_FEATURE_LEVELS: 4
|
||||
TOTAL_NUM_FEATURE_LEVELS: 4
|
||||
IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||
COMMON_STRIDE: 4
|
||||
TRANSFORMER_ENC_LAYERS: 6
|
||||
FEATURE_ORDER: "low2high"
|
||||
DATASETS:
|
||||
TRAIN: ("ovis_train",)
|
||||
TEST: ("ovis_val",)
|
||||
SOLVER:
|
||||
IMS_PER_BATCH: 8
|
||||
BASE_LR: 0.0001
|
||||
STEPS: (12000, )
|
||||
MAX_ITER: 18000
|
||||
CHECKPOINT_PERIOD: 2000
|
||||
INPUT:
|
||||
IMAGE_SIZE: 1536
|
||||
MIN_SCALE: 0.1
|
||||
MAX_SCALE: 2.0
|
||||
FORMAT: "RGB"
|
||||
DATASET_MAPPER_NAME: "coco_instance_lsj"
|
||||
TEST:
|
||||
EVAL_PERIOD: 100000
|
||||
DATALOADER:
|
||||
FILTER_EMPTY_ANNOTATIONS: True
|
||||
NUM_WORKERS: 8
|
||||
OUTPUT_DIR: ./GLEE_Pro_ovis
|
|
@ -0,0 +1,42 @@
|
|||
_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml"
|
||||
MODEL:
|
||||
CROSS_TRACK: False
|
||||
PSEUDO_VIDEO: False
|
||||
FREEZE_WHOLE: False
|
||||
BACKBONE:
|
||||
NAME: "D2_EVA02"
|
||||
EVA02:
|
||||
CHECKPOINT: False
|
||||
# PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth'
|
||||
SEM_SEG_HEAD:
|
||||
# pixel decoder
|
||||
PIXEL_DECODER_NAME: "MaskDINOEncoder"
|
||||
DIM_FEEDFORWARD: 2048
|
||||
NUM_FEATURE_LEVELS: 4
|
||||
TOTAL_NUM_FEATURE_LEVELS: 4
|
||||
IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||
COMMON_STRIDE: 4
|
||||
TRANSFORMER_ENC_LAYERS: 6
|
||||
FEATURE_ORDER: "low2high"
|
||||
DATASETS:
|
||||
TRAIN: ("ytvis_2019_train", )
|
||||
TEST: ("ytvis_2019_val",)
|
||||
SOLVER:
|
||||
IMS_PER_BATCH: 8
|
||||
BASE_LR: 0.0001
|
||||
STEPS: (6000, )
|
||||
MAX_ITER: 8000
|
||||
CHECKPOINT_PERIOD: 2000
|
||||
INPUT:
|
||||
IMAGE_SIZE: 1536
|
||||
MIN_SCALE: 0.1
|
||||
MAX_SCALE: 2.0
|
||||
FORMAT: "RGB"
|
||||
DATASET_MAPPER_NAME: "coco_instance_lsj"
|
||||
TEST:
|
||||
EVAL_PERIOD: 100000
|
||||
DATALOADER:
|
||||
FILTER_EMPTY_ANNOTATIONS: True
|
||||
NUM_WORKERS: 8
|
||||
OUTPUT_DIR: ./GLEE_Pro_ytvis19
|
|
@ -222,8 +222,9 @@ def instances_to_coco_json_video(inputs, outputs):
|
|||
segms.append(dummy_seg)
|
||||
_boxes.append(None)
|
||||
else:
|
||||
segms.append(mask_util.encode(np.array(_mask[:, :, None], order="F", dtype="uint8"))[0])
|
||||
if _mask.sum()>5 and _box is not None:
|
||||
rle = mask_util.encode(np.array(_mask[:, :, None], order="F", dtype="uint8"))[0]
|
||||
segms.append(rle)
|
||||
if mask_util.area(rle)>5 and _box is not None:
|
||||
_boxes.append(_box.tolist())
|
||||
|
||||
for rle in segms:
|
||||
|
|
Loading…
Reference in New Issue