mirror of https://github.com/FoundationVision/GLEE
Add video training and inference scripts for GLEE-Pro and improve inference speed
parent
5f1832dd7a
commit
028ecee13c
|
@@ -0,0 +1,42 @@
|
||||||
|
_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml"
|
||||||
|
MODEL:
|
||||||
|
CROSS_TRACK: False
|
||||||
|
PSEUDO_VIDEO: False
|
||||||
|
FREEZE_WHOLE: False
|
||||||
|
BACKBONE:
|
||||||
|
NAME: "D2_EVA02"
|
||||||
|
EVA02:
|
||||||
|
CHECKPOINT: False
|
||||||
|
# PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth'
|
||||||
|
SEM_SEG_HEAD:
|
||||||
|
# pixel decoder
|
||||||
|
PIXEL_DECODER_NAME: "MaskDINOEncoder"
|
||||||
|
DIM_FEEDFORWARD: 2048
|
||||||
|
NUM_FEATURE_LEVELS: 4
|
||||||
|
TOTAL_NUM_FEATURE_LEVELS: 4
|
||||||
|
IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||||
|
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||||
|
COMMON_STRIDE: 4
|
||||||
|
TRANSFORMER_ENC_LAYERS: 6
|
||||||
|
FEATURE_ORDER: "low2high"
|
||||||
|
DATASETS:
|
||||||
|
TRAIN: ("BURST_video_train",)
|
||||||
|
TEST: ("BURST_video_val",)
|
||||||
|
SOLVER:
|
||||||
|
IMS_PER_BATCH: 8
|
||||||
|
BASE_LR: 0.0001
|
||||||
|
STEPS: (6000, )
|
||||||
|
MAX_ITER: 8000
|
||||||
|
CHECKPOINT_PERIOD: 2000
|
||||||
|
INPUT:
|
||||||
|
IMAGE_SIZE: 1536
|
||||||
|
MIN_SCALE: 0.1
|
||||||
|
MAX_SCALE: 2.0
|
||||||
|
FORMAT: "RGB"
|
||||||
|
DATASET_MAPPER_NAME: "coco_instance_lsj"
|
||||||
|
TEST:
|
||||||
|
EVAL_PERIOD: 100000
|
||||||
|
DATALOADER:
|
||||||
|
FILTER_EMPTY_ANNOTATIONS: False
|
||||||
|
NUM_WORKERS: 8
|
||||||
|
OUTPUT_DIR: ./GLEE_Pro_BURST
|
|
@@ -0,0 +1,42 @@
|
||||||
|
_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml"
|
||||||
|
MODEL:
|
||||||
|
CROSS_TRACK: False
|
||||||
|
PSEUDO_VIDEO: False
|
||||||
|
FREEZE_WHOLE: False
|
||||||
|
BACKBONE:
|
||||||
|
NAME: "D2_EVA02"
|
||||||
|
EVA02:
|
||||||
|
CHECKPOINT: False
|
||||||
|
# PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth'
|
||||||
|
SEM_SEG_HEAD:
|
||||||
|
# pixel decoder
|
||||||
|
PIXEL_DECODER_NAME: "MaskDINOEncoder"
|
||||||
|
DIM_FEEDFORWARD: 2048
|
||||||
|
NUM_FEATURE_LEVELS: 4
|
||||||
|
TOTAL_NUM_FEATURE_LEVELS: 4
|
||||||
|
IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||||
|
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||||
|
COMMON_STRIDE: 4
|
||||||
|
TRANSFORMER_ENC_LAYERS: 6
|
||||||
|
FEATURE_ORDER: "low2high"
|
||||||
|
DATASETS:
|
||||||
|
TRAIN: ("BURST_video_train",)
|
||||||
|
TEST: ("TAO_video_val",)
|
||||||
|
SOLVER:
|
||||||
|
IMS_PER_BATCH: 8
|
||||||
|
BASE_LR: 0.0001
|
||||||
|
STEPS: (6000, )
|
||||||
|
MAX_ITER: 8000
|
||||||
|
CHECKPOINT_PERIOD: 2000
|
||||||
|
INPUT:
|
||||||
|
IMAGE_SIZE: 1536
|
||||||
|
MIN_SCALE: 0.1
|
||||||
|
MAX_SCALE: 2.0
|
||||||
|
FORMAT: "RGB"
|
||||||
|
DATASET_MAPPER_NAME: "coco_instance_lsj"
|
||||||
|
TEST:
|
||||||
|
EVAL_PERIOD: 100000
|
||||||
|
DATALOADER:
|
||||||
|
FILTER_EMPTY_ANNOTATIONS: False
|
||||||
|
NUM_WORKERS: 8
|
||||||
|
OUTPUT_DIR: ./GLEE_Pro_TAO
|
|
@@ -0,0 +1,42 @@
|
||||||
|
_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml"
|
||||||
|
MODEL:
|
||||||
|
CROSS_TRACK: False
|
||||||
|
PSEUDO_VIDEO: False
|
||||||
|
FREEZE_WHOLE: False
|
||||||
|
BACKBONE:
|
||||||
|
NAME: "D2_EVA02"
|
||||||
|
EVA02:
|
||||||
|
CHECKPOINT: False
|
||||||
|
# PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth'
|
||||||
|
SEM_SEG_HEAD:
|
||||||
|
# pixel decoder
|
||||||
|
PIXEL_DECODER_NAME: "MaskDINOEncoder"
|
||||||
|
DIM_FEEDFORWARD: 2048
|
||||||
|
NUM_FEATURE_LEVELS: 4
|
||||||
|
TOTAL_NUM_FEATURE_LEVELS: 4
|
||||||
|
IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||||
|
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||||
|
COMMON_STRIDE: 4
|
||||||
|
TRANSFORMER_ENC_LAYERS: 6
|
||||||
|
FEATURE_ORDER: "low2high"
|
||||||
|
DATASETS:
|
||||||
|
TRAIN: ("ovis_train",)
|
||||||
|
TEST: ("ovis_val",)
|
||||||
|
SOLVER:
|
||||||
|
IMS_PER_BATCH: 8
|
||||||
|
BASE_LR: 0.0001
|
||||||
|
STEPS: (12000, )
|
||||||
|
MAX_ITER: 18000
|
||||||
|
CHECKPOINT_PERIOD: 2000
|
||||||
|
INPUT:
|
||||||
|
IMAGE_SIZE: 1536
|
||||||
|
MIN_SCALE: 0.1
|
||||||
|
MAX_SCALE: 2.0
|
||||||
|
FORMAT: "RGB"
|
||||||
|
DATASET_MAPPER_NAME: "coco_instance_lsj"
|
||||||
|
TEST:
|
||||||
|
EVAL_PERIOD: 100000
|
||||||
|
DATALOADER:
|
||||||
|
FILTER_EMPTY_ANNOTATIONS: True
|
||||||
|
NUM_WORKERS: 8
|
||||||
|
OUTPUT_DIR: ./GLEE_Pro_ovis
|
|
@@ -0,0 +1,42 @@
|
||||||
|
_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml"
|
||||||
|
MODEL:
|
||||||
|
CROSS_TRACK: False
|
||||||
|
PSEUDO_VIDEO: False
|
||||||
|
FREEZE_WHOLE: False
|
||||||
|
BACKBONE:
|
||||||
|
NAME: "D2_EVA02"
|
||||||
|
EVA02:
|
||||||
|
CHECKPOINT: False
|
||||||
|
# PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth'
|
||||||
|
SEM_SEG_HEAD:
|
||||||
|
# pixel decoder
|
||||||
|
PIXEL_DECODER_NAME: "MaskDINOEncoder"
|
||||||
|
DIM_FEEDFORWARD: 2048
|
||||||
|
NUM_FEATURE_LEVELS: 4
|
||||||
|
TOTAL_NUM_FEATURE_LEVELS: 4
|
||||||
|
IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||||
|
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"]
|
||||||
|
COMMON_STRIDE: 4
|
||||||
|
TRANSFORMER_ENC_LAYERS: 6
|
||||||
|
FEATURE_ORDER: "low2high"
|
||||||
|
DATASETS:
|
||||||
|
TRAIN: ("ytvis_2019_train", )
|
||||||
|
TEST: ("ytvis_2019_val",)
|
||||||
|
SOLVER:
|
||||||
|
IMS_PER_BATCH: 8
|
||||||
|
BASE_LR: 0.0001
|
||||||
|
STEPS: (6000, )
|
||||||
|
MAX_ITER: 8000
|
||||||
|
CHECKPOINT_PERIOD: 2000
|
||||||
|
INPUT:
|
||||||
|
IMAGE_SIZE: 1536
|
||||||
|
MIN_SCALE: 0.1
|
||||||
|
MAX_SCALE: 2.0
|
||||||
|
FORMAT: "RGB"
|
||||||
|
DATASET_MAPPER_NAME: "coco_instance_lsj"
|
||||||
|
TEST:
|
||||||
|
EVAL_PERIOD: 100000
|
||||||
|
DATALOADER:
|
||||||
|
FILTER_EMPTY_ANNOTATIONS: True
|
||||||
|
NUM_WORKERS: 8
|
||||||
|
OUTPUT_DIR: ./GLEE_Pro_ytvis19
|
|
@@ -222,8 +222,9 @@ def instances_to_coco_json_video(inputs, outputs):
|
||||||
segms.append(dummy_seg)
|
segms.append(dummy_seg)
|
||||||
_boxes.append(None)
|
_boxes.append(None)
|
||||||
else:
|
else:
|
||||||
segms.append(mask_util.encode(np.array(_mask[:, :, None], order="F", dtype="uint8"))[0])
|
rle = mask_util.encode(np.array(_mask[:, :, None], order="F", dtype="uint8"))[0]
|
||||||
if _mask.sum()>5 and _box is not None:
|
segms.append(rle)
|
||||||
|
if mask_util.area(rle)>5 and _box is not None:
|
||||||
_boxes.append(_box.tolist())
|
_boxes.append(_box.tolist())
|
||||||
|
|
||||||
for rle in segms:
|
for rle in segms:
|
||||||
|
|
Loading…
Reference in New Issue