diff --git a/README.md b/README.md index 69d0dfc..b455418 100644 --- a/README.md +++ b/README.md @@ -125,23 +125,36 @@ This will generate query bank files for each dataset in ODinW in ``MODEL/{datas ### Some paramters corresponding to the query extraction: +The above [script](tools/extract_vision_query.py) has already set all parameters well. One only needs to pass: + +``--config_file`` is the pretraining config file. + +``--dataset`` contains some pre-defined datasets including ``objects365``, ``lvis``, ``odinw-13``, and ``odinw-35``. + +``--num_vision_queries`` controls the number of vision queries for each category you want to extract from the training dataset, and can be an arbitrary number. This will set both ``VISION_QUERY.MAX_QUERY_NUMBER`` and ``DATASETS.FEW_SHOT`` to ``num_vision_queries``. +Note that here ``DATASETS.FEW_SHOT`` is only for accelerating the extraction process. + +``--add_name`` is only a mark for different models. +For training/evaluating with MQ-GLIP-T/MQ-GLIP-L/MQ-GroundingDINO, we set ``--add_name`` to 'tiny'/'large'/'gd'. + +For customized usage, one can modify the commands in the [script](tools/extract_vision_query.py), or pass additional parameters through ``--opt``, for example, +``` +python tools/extract_vision_query.py --config_file configs/pretrain/mq-glip-t.yaml --dataset lvis --opt 'VISION_QUERY.MAX_QUERY_NUMBER 50 DATASETS.FEW_SHOT 50' +``` + +Here are several parameters that may be used during query extraction, more details can be found in the [code](maskrcnn_benchmark/config/defaults.py): + ``DATASETS.FEW_SHOT``: if set ``k>0``, the dataset will be subsampled to k-shot for each category when initializing the dataset. This is completed before training. Not used during pre-training. -``VISION_QUERY.MAX_QUERY_NUMBER``: the max number of vision queries for each category when extracting the query bank. Note that the query extraction is conducted before training and evaluation. 
+``VISION_QUERY.MAX_QUERY_NUMBER``: the max number of vision queries for each category when extracting the query bank. Only used during query extraction. Note that the query extraction is conducted before training and evaluation. -``VISION_QUERY.NUM_QUERY_PER_CLASS`` controls how many queries to provide for each category during one forward process in training and evaluation. +``VISION_QUERY.NUM_QUERY_PER_CLASS`` controls how many queries to provide for each category during one forward process in training and evaluation. Not used during query extraction. Usually, we set ``VISION_QUERY.MAX_QUERY_NUMBER=5000``, ``VISION_QUERY.NUM_QUERY_PER_CLASS=5``, ``DATASETS.FEW_SHOT=0`` during pre-training. -``VISION_QUERY.MAX_QUERY_NUMBER=5``, ``VISION_QUERY.NUM_QUERY_PER_CLASS=5``, ``DATASETS.FEW_SHOT=5`` during few-shot (5-shot) fine-tuning. - -``--num_vision_queries`` denotes number of vision queries for each category, and can be an arbitrary number. This will set both ``VISION_QUERY.MAX_QUERY_NUMBER`` and ``DATASETS.FEW_SHOT`` to ``num_vision_queries``. -Note that here ``DATASETS.FEW_SHOT`` is only for accelerating the extraction process. - -``--add_name`` is only a mark for different models. -For training/evaluating with MQ-GLIP-T/MQ-GLIP-L/MQ-GroundingDINO, we set ``--add_name`` to 'tiny'/'large'/'gd'. +``VISION_QUERY.MAX_QUERY_NUMBER=k``, ``VISION_QUERY.NUM_QUERY_PER_CLASS=k``, ``DATASETS.FEW_SHOT=k`` during few-shot (k-shot) fine-tuning. 
## Modulated Training diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py index fff706a..be0d185 100644 --- a/maskrcnn_benchmark/config/defaults.py +++ b/maskrcnn_benchmark/config/defaults.py @@ -893,30 +893,31 @@ _C.GLIPKNOW.LAN_FEATURE_AGG_TYPE = "first" _C.GLIPKNOW.GPT3_NUM = 5 _C.GLIPKNOW.WIKI_AND_GPT3 = False -# settings of vision query +# ---------------------------------------------------------------------------- # +# Vision query options +# ---------------------------------------------------------------------------- # _C.VISION_QUERY = CN() -# expand bbox for better retrival -_C.VISION_QUERY.ENABLED = False -_C.VISION_QUERY.EXPAND_RATIO = 1.5 -# _C.VISION_QUERY.NUM_TOKENS_PER_INSTANCE = 4 + +_C.VISION_QUERY.ENABLED = False # if set False, the model is equal to GLIP +_C.VISION_QUERY.EXPAND_RATIO = 1.5 # expand the bbox before it is cropped as a vision query during query extraction _C.VISION_QUERY.DATASET_NAME = "" -_C.VISION_QUERY.MAX_QUERY_NUMBER = 5000 +_C.VISION_QUERY.MAX_QUERY_NUMBER = 5000 # max per-category number of the query bank (used before training) _C.VISION_QUERY.MAX_TEST_QUERY_NUMBER = 100 # for test-time online update _C.VISION_QUERY.SELECT_FPN_LEVEL = True -_C.VISION_QUERY.QUERY_BANK_PATH = "" -_C.VISION_QUERY.PURE_TEXT_RATE = 0. -_C.VISION_QUERY.NUM_QUERY_PER_CLASS = 5 -_C.VISION_QUERY.SHARE_KV = False -_C.VISION_QUERY.TEXT_DROPOUT = 0. -_C.VISION_QUERY.NEW_MASK_TOKEN = False -_C.VISION_QUERY.MASK_DURING_INFERENCE = False +_C.VISION_QUERY.QUERY_BANK_PATH = "" # path to an extracted query bank +_C.VISION_QUERY.PURE_TEXT_RATE = 0. # probability to pass a pure text description without any vision queries for a category +_C.VISION_QUERY.NUM_QUERY_PER_CLASS = 5 # per-category query number for each forward process (used during training) +_C.VISION_QUERY.SHARE_KV = False # only for efficiency +_C.VISION_QUERY.TEXT_DROPOUT = 0. 
# probability to mask an input category text - the core of vision conditioned masked language prediction +_C.VISION_QUERY.NEW_MASK_TOKEN = False # using a newly initialized mask token rather than the [MASK] token in BERT to mask a word +_C.VISION_QUERY.MASK_DURING_INFERENCE = False # mask words during inference. Only used during single modality evaluation, e.g., only vision queries _C.VISION_QUERY.GATE_REGULARIZATION = False _C.VISION_QUERY.GATE_REGULARIZATION_SCALE = 0.1 _C.VISION_QUERY.FIX_ATTN_GATE = -1.0 _C.VISION_QUERY.VISION_SCALE = 1.0 -_C.VISION_QUERY.RANDOM_KSHOT = False -_C.VISION_QUERY.LEARNABLE_BANK = False -_C.VISION_QUERY.CONDITION_GATE = False +_C.VISION_QUERY.RANDOM_KSHOT = False # randomly select the number of shots during modulated pretraining +_C.VISION_QUERY.LEARNABLE_BANK = False # make the extracted query bank learnable, can be used during k-shot finetuning +_C.VISION_QUERY.CONDITION_GATE = False # the models in the paper all set this param to True to enable conditional gates. 
_C.VISION_QUERY.ADD_VISION_LAYER = False _C.VISION_QUERY.DISABLE_SELECTOR = False # for extract queries _C.VISION_QUERY.ADD_ADAPT_LAYER = False @@ -925,8 +926,7 @@ _C.VISION_QUERY.NONLINEAR_GATE = False _C.VISION_QUERY.SCORE_THRESHOLD = 0.6 # To filter trustable instances for test-time online update _C.VISION_QUERY.SIMILARITY_THRESHOLD = 0.85 # If exclude_similar == True in extract queries, remove features that similar with banks _C.VISION_QUERY.NUM_TURNS = 1 # number of turns to extract test queries -_C.VISION_QUERY.SAVE_ON_CEPH = False # TODO: remove when open source -_C.VISION_QUERY.NO_CAT = False # TODO: remove in formal version +_C.VISION_QUERY.NO_CAT = True # only for debug _C.VISION_QUERY.QUERY_ADDITION_NAME = "" _C.VISION_QUERY.OFFLINE_WITH_ONLINE = False _C.VISION_QUERY.AUGMENT_IMAGE_WITH_QUERY = False @@ -934,10 +934,13 @@ _C.VISION_QUERY.CUSTOM_DATA_IDS = None # only for extract user-oriented vision q _C.VISION_QUERY.CUSTOM_CAT_IDS = None # only for custom evaluation _C.VISION_QUERY.QUERY_BANK_SAVE_PATH = '' _C.VISION_QUERY.RETURN_ATTN_GATE_VALUE = False -_C.VISION_QUERY.DEBUG = False -_C.VISION_QUERY.OWLVIT = False # debug +_C.VISION_QUERY.DEBUG = False # only for debug +_C.VISION_QUERY.OWLVIT = False # only for debug + +# ---------------------------------------------------------------------------- # # GroundingDINO +# ---------------------------------------------------------------------------- # _C.GROUNDINGDINO = CN() _C.GROUNDINGDINO.enabled = False _C.GROUNDINGDINO.modelname = "groundingdino"