diff --git a/README.md b/README.md index 69d0dfc..b455418 100644 --- a/README.md +++ b/README.md @@ -125,23 +125,36 @@ This will generate query bank files for each dataset in ODinW in ``MODEL/{datas ### Some paramters corresponding to the query extraction: +The above [script](tools/extract_vision_query.py) has already set all parameters well. One only needs to pass: + +``--config_file`` is the pretraining config file. + +``--dataset`` contains some pre-defined datasets including ``objects365``, ``lvis``, ``odinw-13``, and ``odinw-35``. + +``--num_vision_queries`` controls the number of vision queries for each category you want to extract from the training dataset, and can be an arbitrary number. This will set both ``VISION_QUERY.MAX_QUERY_NUMBER`` and ``DATASETS.FEW_SHOT`` to ``num_vision_queries``. +Note that here ``DATASETS.FEW_SHOT`` is only for accelerating the extraction process. + +``--add_name`` is only a mark for different models. +For training/evaluating with MQ-GLIP-T/MQ-GLIP-L/MQ-GroundingDINO, we set ``--add_name`` to 'tiny'/'large'/'gd'. + +For customized usage, one can modify the commands in the [script](tools/extract_vision_query.py), or pass additional parameters through ``--opt``, for example, +``` +python tools/extract_vision_query.py --config_file configs/pretrain/mq-glip-t.yaml --dataset lvis --opt 'VISION_QUERY.MAX_QUERY_NUMBER 50 DATASETS.FEW_SHOT 50' +``` + +Here are several parameters that may be used during query extraction, more details can be found in the [code](maskrcnn_benchmark/config/defaults.py): + ``DATASETS.FEW_SHOT``: if set ``k>0``, the dataset will be subsampled to k-shot for each category when initializing the dataset. This is completed before training. Not used during pre-training. -``VISION_QUERY.MAX_QUERY_NUMBER``: the max number of vision queries for each category when extracting the query bank. Note that the query extraction is conducted before training and evaluation. 
+``VISION_QUERY.MAX_QUERY_NUMBER``: the max number of vision queries for each category when extracting the query bank. Only used during query extraction. Note that the query extraction is conducted before training and evaluation. -``VISION_QUERY.NUM_QUERY_PER_CLASS`` controls how many queries to provide for each category during one forward process in training and evaluation. +``VISION_QUERY.NUM_QUERY_PER_CLASS`` controls how many queries to provide for each category during one forward process in training and evaluation. Not used during query extraction. Usually, we set ``VISION_QUERY.MAX_QUERY_NUMBER=5000``, ``VISION_QUERY.NUM_QUERY_PER_CLASS=5``, ``DATASETS.FEW_SHOT=0`` during pre-training. -``VISION_QUERY.MAX_QUERY_NUMBER=5``, ``VISION_QUERY.NUM_QUERY_PER_CLASS=5``, ``DATASETS.FEW_SHOT=5`` during few-shot (5-shot) fine-tuning. - -``--num_vision_queries`` denotes number of vision queries for each category, and can be an arbitrary number. This will set both ``VISION_QUERY.MAX_QUERY_NUMBER`` and ``DATASETS.FEW_SHOT`` to ``num_vision_queries``. -Note that here ``DATASETS.FEW_SHOT`` is only for accelerating the extraction process. - -``--add_name`` is only a mark for different models. -For training/evaluating with MQ-GLIP-T/MQ-GLIP-L/MQ-GroundingDINO, we set ``--add_name`` to 'tiny'/'large'/'gd'. +``VISION_QUERY.MAX_QUERY_NUMBER=k``, ``VISION_QUERY.NUM_QUERY_PER_CLASS=k``, ``DATASETS.FEW_SHOT=k`` during few-shot (k-shot) fine-tuning. 
## Modulated Training diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py index fff706a..be0d185 100644 --- a/maskrcnn_benchmark/config/defaults.py +++ b/maskrcnn_benchmark/config/defaults.py @@ -893,30 +893,31 @@ _C.GLIPKNOW.LAN_FEATURE_AGG_TYPE = "first" _C.GLIPKNOW.GPT3_NUM = 5 _C.GLIPKNOW.WIKI_AND_GPT3 = False -# settings of vision query +# ---------------------------------------------------------------------------- # +# Vision query options +# ---------------------------------------------------------------------------- # _C.VISION_QUERY = CN() -# expand bbox for better retrival -_C.VISION_QUERY.ENABLED = False -_C.VISION_QUERY.EXPAND_RATIO = 1.5 -# _C.VISION_QUERY.NUM_TOKENS_PER_INSTANCE = 4 + +_C.VISION_QUERY.ENABLED = False # if set False, the model is equal to GLIP +_C.VISION_QUERY.EXPAND_RATIO = 1.5 # expand the bbox before it is cropped as a vision query during query extraction _C.VISION_QUERY.DATASET_NAME = "" -_C.VISION_QUERY.MAX_QUERY_NUMBER = 5000 +_C.VISION_QUERY.MAX_QUERY_NUMBER = 5000 # max per-category number of the query bank (used before training) _C.VISION_QUERY.MAX_TEST_QUERY_NUMBER = 100 # for test-time online update _C.VISION_QUERY.SELECT_FPN_LEVEL = True -_C.VISION_QUERY.QUERY_BANK_PATH = "" -_C.VISION_QUERY.PURE_TEXT_RATE = 0. -_C.VISION_QUERY.NUM_QUERY_PER_CLASS = 5 -_C.VISION_QUERY.SHARE_KV = False -_C.VISION_QUERY.TEXT_DROPOUT = 0. -_C.VISION_QUERY.NEW_MASK_TOKEN = False -_C.VISION_QUERY.MASK_DURING_INFERENCE = False +_C.VISION_QUERY.QUERY_BANK_PATH = "" # path to an extracted query bank +_C.VISION_QUERY.PURE_TEXT_RATE = 0. # probability to pass a pure text description without any vision queries for a category +_C.VISION_QUERY.NUM_QUERY_PER_CLASS = 5 # per-category query number for each forward process (used during training) +_C.VISION_QUERY.SHARE_KV = False # only for efficiency +_C.VISION_QUERY.TEXT_DROPOUT = 0. 
# probability to mask an input category text - the core of vision conditioned masked language prediction +_C.VISION_QUERY.NEW_MASK_TOKEN = False # using a newly initialized mask token rather than the [MASK] token in BERT to mask a word +_C.VISION_QUERY.MASK_DURING_INFERENCE = False # mask words during inference. Only used during single modality evaluation, e.g., only vision queries _C.VISION_QUERY.GATE_REGULARIZATION = False _C.VISION_QUERY.GATE_REGULARIZATION_SCALE = 0.1 _C.VISION_QUERY.FIX_ATTN_GATE = -1.0 _C.VISION_QUERY.VISION_SCALE = 1.0 -_C.VISION_QUERY.RANDOM_KSHOT = False -_C.VISION_QUERY.LEARNABLE_BANK = False -_C.VISION_QUERY.CONDITION_GATE = False +_C.VISION_QUERY.RANDOM_KSHOT = False # randomly select the number of shots during modulated pretraining +_C.VISION_QUERY.LEARNABLE_BANK = False # make the extracted query bank learnable, can be used during k-shot finetuning +_C.VISION_QUERY.CONDITION_GATE = False # the models in the paper all set this param to True to enable conditional gates. 
_C.VISION_QUERY.ADD_VISION_LAYER = False _C.VISION_QUERY.DISABLE_SELECTOR = False # for extract queries _C.VISION_QUERY.ADD_ADAPT_LAYER = False @@ -925,8 +926,7 @@ _C.VISION_QUERY.NONLINEAR_GATE = False _C.VISION_QUERY.SCORE_THRESHOLD = 0.6 # To filter trustable instances for test-time online update _C.VISION_QUERY.SIMILARITY_THRESHOLD = 0.85 # If exclude_similar == True in extract queries, remove features that similar with banks _C.VISION_QUERY.NUM_TURNS = 1 # number of turns to extract test queries -_C.VISION_QUERY.SAVE_ON_CEPH = False # TODO: remove when open source -_C.VISION_QUERY.NO_CAT = False # TODO: remove in formal version +_C.VISION_QUERY.NO_CAT = True # only for debug _C.VISION_QUERY.QUERY_ADDITION_NAME = "" _C.VISION_QUERY.OFFLINE_WITH_ONLINE = False _C.VISION_QUERY.AUGMENT_IMAGE_WITH_QUERY = False @@ -934,10 +934,13 @@ _C.VISION_QUERY.CUSTOM_DATA_IDS = None # only for extract user-oriented vision q _C.VISION_QUERY.CUSTOM_CAT_IDS = None # only for custom evaluation _C.VISION_QUERY.QUERY_BANK_SAVE_PATH = '' _C.VISION_QUERY.RETURN_ATTN_GATE_VALUE = False -_C.VISION_QUERY.DEBUG = False -_C.VISION_QUERY.OWLVIT = False # debug +_C.VISION_QUERY.DEBUG = False # only for debug +_C.VISION_QUERY.OWLVIT = False # only for debug + +# ---------------------------------------------------------------------------- # # GroundingDINO +# ---------------------------------------------------------------------------- # _C.GROUNDINGDINO = CN() _C.GROUNDINGDINO.enabled = False _C.GROUNDINGDINO.modelname = "groundingdino"