diff --git a/README.md b/README.md
index d0d1171..d5ab17f 100644
--- a/README.md
+++ b/README.md
@@ -67,13 +67,15 @@ We do detection evaluation on COCO val2017.
 Process visual prompt embeddings for inference. We calculate the all the instance prompt embeddings of the validate set (you can also use the training set, but the processing time is much longer) and store them. Then we infrence by randomly selecting some visual prompts as in-context examples.
 * Infenrence script to get and store visual prompts
 ```shell
-python train_net.py --eval_only --resume --eval_get_content_features --num-gpus 8 --config-file /path/to/configs COCO.TEST.BATCH_SIZE_TOTAL=8 OUTPUT_DIR=$outdir MODEL.WEIGHTS=/path/to/weights
+python train_net.py --eval_only --resume --eval_get_content_features --num-gpus 8 --config-file /path/to/configs COCO.TEST.BATCH_SIZE_TOTAL=8 MODEL.WEIGHTS=/path/to/weights OUTPUT_DIR=/path/to/outputs
 ```
 * Inference script for open-set detection on COCO with visual prompts
 ```shell
-python train_net.py --eval_only --resume --eval_visual_openset --num-gpus 8 --config-file /path/to/configs COCO.TEST.BATCH_SIZE_TOTAL=8 OUTPUT_DIR=$outdir MODEL.WEIGHTS=/path/to/weights MODEL.DECODER.INFERENCE_EXAMPLE=16
+python train_net.py --eval_only --resume --eval_visual_openset --num-gpus 8 --config-file /path/to/configs COCO.TEST.BATCH_SIZE_TOTAL=8 MODEL.WEIGHTS=/path/to/weights MODEL.DECODER.INFERENCE_EXAMPLE=16 OUTPUT_DIR=/path/to/outputs
 ```
-configs to use are `configs/dinov_sam_coco_train.yaml` for swinT and `configs/dinov_sam_coco_swinl_train.yaml` for swinL.
+* **Configs** to use: `configs/dinov_sam_coco_train.yaml` for SwinT and `configs/dinov_sam_coco_swinl_train.yaml` for SwinL.
+* `OUTPUT_DIR` is the directory where the visual prompt embeddings are stored.
+* `INFERENCE_EXAMPLE` is the number of in-context examples used to represent a category. Defaults to 16.
 
 ### :star: Training
 We currently release the code of training on SA-1B and COCO. It can also support Objects365 and other datasets with minimal modifications. `$n` is the number of gpus you use
@@ -91,14 +93,15 @@ We recommend using total batchsize `64` for training, which provides enough post
 
 For SwinT backbone
 ```shell
-python train_net.py --resume --num-gpus 8 --config-file configs/dinov_sam_coco_train.yaml SAM.TRAIN.BATCH_SIZE_TOTAL=8 COCO.TRAIN.BATCH_SIZE_TOTAL=64
+python train_net.py --resume --num-gpus 8 --config-file configs/dinov_sam_coco_train.yaml SAM.TRAIN.BATCH_SIZE_TOTAL=8 COCO.TRAIN.BATCH_SIZE_TOTAL=8
 ```
 For SwinL backbone
 ```shell
-python train_net.py --resume --num-gpus 8 --config-file configs/dinov_sam_coco_swinl_train.yaml SAM.TRAIN.BATCH_SIZE_TOTAL=8 COCO.TRAIN.BATCH_SIZE_TOTAL=64
+python train_net.py --resume --num-gpus 8 --config-file configs/dinov_sam_coco_swinl_train.yaml SAM.TRAIN.BATCH_SIZE_TOTAL=8 COCO.TRAIN.BATCH_SIZE_TOTAL=8
 ```
-* Please use multi-node training if your gpu cannot handle batch 64 in one node.
-* By default, we do not use COCO data for referring segmentation training. You can set `MODEL.DECODER.COCO_TRACK=True` to enable this task, which can improve the referring segmentation performance on DAVIS. However, we did not implement multi-image training for this task, which mean you can only put **one image on a gpu** for this task.
+* Please use multi-node training, i.e., 64 GPUs for a total batch size of 64, where each GPU handles one SAM image and one COCO image.
+* By default, we do not use COCO data for referring segmentation training. You can set `MODEL.DECODER.COCO_TRACK=True` to enable this task, which can improve the referring segmentation performance on DAVIS.
+* We did not implement multi-image training for this task, which means you can only put **one image on a GPU** for it.
 
 # Model framework
 ![framework](https://github.com/UX-Decoder/DINOv/assets/34880758/8c756028-a7bd-42dc-8aa7-e6773fd60711)
diff --git a/dinov/architectures/dinov.py b/dinov/architectures/dinov.py
index e7114de..ed7f151 100644
--- a/dinov/architectures/dinov.py
+++ b/dinov/architectures/dinov.py
@@ -1051,7 +1051,6 @@ class DINOv(nn.Module):
         if empty_flag:
             for i, target in enumerate(new_targets):
                 target['fake'] = True
-        print("cross_gpu, new_targets ", cross_gpu, len(new_targets))
         if not empty_flag and not cross_gpu:
             # handle batch in 1 gpu only, do not need cross gpu sync
             # if cross_gpu=True, sync will be performed in the decoder
diff --git a/train_net.py b/train_net.py
index 5010383..831b465 100644
--- a/train_net.py
+++ b/train_net.py
@@ -423,7 +423,8 @@ class Trainer(DefaultTrainer):
         dataset_names = cfg['DATASETS']['TEST']
         weight_path = cfg['MODEL']['WEIGHTS']
         ckpt = weight_path.split('/')
-        output_dir_ = cfg['OUTPUT_DIR']+'_'+ckpt[-1]
+        # output_dir_ = cfg['OUTPUT_DIR']+'_'+ckpt[-1]
+        output_dir_ = cfg['OUTPUT_DIR']
         if comm.is_main_process() and not os.path.exists(output_dir_):
             os.mkdir(output_dir_)
         model = model.eval().cuda()
@@ -494,9 +495,8 @@
                 dir_name = dataset_name.split('_')[1]
             else:
                 dir_name = dataset_name.replace('train', 'val')
-            output_dir = output_dir_
-            if 'coco' not in output_dir:
-                output_dir = os.path.join(output_dir_, dir_name)
+            # output_dir = output_dir_
+            output_dir = os.path.join(output_dir_, dir_name)
             model_without_ddp.model.sem_seg_head.predictor.out_dir = output_dir
             # build evaluator
             evaluator = build_evaluator(cfg, dataset_name, cfg['OUTPUT_DIR'])
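Below is a minimal sketch of how the two inference commands from the updated README chain together after this patch. The weight and output paths are placeholders, and sharing the same `OUTPUT_DIR` between the two steps (so the open-set run can find the embeddings written under the per-dataset subdirectory created by the patched `train_net.py`) is an assumption, not something the patch states explicitly.

```shell
# Sketch only: paths are placeholders; reusing OUTPUT_DIR across both steps is an assumption.
OUTPUT=/path/to/outputs
WEIGHTS=/path/to/weights

# Step 1: compute and store visual prompt embeddings (written under $OUTPUT/<dataset dir>)
python train_net.py --eval_only --resume --eval_get_content_features --num-gpus 8 \
  --config-file configs/dinov_sam_coco_train.yaml \
  COCO.TEST.BATCH_SIZE_TOTAL=8 MODEL.WEIGHTS=$WEIGHTS OUTPUT_DIR=$OUTPUT

# Step 2: open-set detection on COCO, sampling 16 stored prompts per category as in-context examples
python train_net.py --eval_only --resume --eval_visual_openset --num-gpus 8 \
  --config-file configs/dinov_sam_coco_train.yaml \
  COCO.TEST.BATCH_SIZE_TOTAL=8 MODEL.WEIGHTS=$WEIGHTS \
  MODEL.DECODER.INFERENCE_EXAMPLE=16 OUTPUT_DIR=$OUTPUT
```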