diff --git a/README.md b/README.md
index 634943d..55777dd 100644
--- a/README.md
+++ b/README.md
@@ -6,32 +6,31 @@ We introduce **SEEM** that can **S**egment **E**verything **E**verywhere with **
**One-Line Demo with Linux:**
```sh
-git clone git@github.com:UX-Decoder/Segment-Everything-Everywhere-All-At-Once.git && cd Segment-Everything-Everywhere-All-At-Once/demo_code && sh run_demo.sh
+git clone git@github.com:UX-Decoder/Segment-Everything-Everywhere-All-At-Once.git && cd Segment-Everything-Everywhere-All-At-Once && sh assets/scripts/run_demo.sh
```
**Getting Started:**
-[INSTALL.md]()
-[DATASET.md]()
-[TRAIN.md]()
-[EVAL.md]()
+[INSTALL.md](assets/readmes/INSTALL.md)
+[DATASET.md](assets/readmes/DATASET.md)
+[TRAIN.md](assets/readmes/TRAIN.md)
+[EVAL.md](assets/readmes/EVAL.md)
:point_right: *[New]* **Latest Checkpoints and Numbers:**
| | | | COCO | | | Ref-COCOg | | | VOC | | SBD | |
|-----------------|---------------------------------------------------------------------------------------------|----------|------|------|------|-----------|------|------|-------|-------|-------|-------|
| Method | Checkpoint | backbone | PQ ↑ | mAP ↑ | mIoU ↑ | cIoU ↑ | mIoU ↑ | AP50 ↑ | NoC85 ↓ | NoC90 ↓| NoC85 ↓| NoC90 ↓|
-| X-Decoder | [ckpt]() | Focal-T | 50.8 | 39.5 | 62.4 | 57.6 | 63.2 | 71.6 | - | - | - | - |
-| X-Decoder-oq201 | [ckpt]() | Focal-L | 56.5 | 46.7 | 67.2 | 62.8 | 67.5 | 76.3 | - | - | - | - |
-| SEEM_v0 | [ckpt]() | Focal-T | 50.6 | 39.4 | 60.9 | 58.5 | 63.5 | 71.6 | 3.54 | 4.59 | * | * |
+| X-Decoder | [ckpt](https://huggingface.co/xdecoder/X-Decoder/resolve/main/xdecoder_focalt_last.pt) | Focal-T | 50.8 | 39.5 | 62.4 | 57.6 | 63.2 | 71.6 | - | - | - | - |
+| X-Decoder-oq201 | [ckpt](https://huggingface.co/xdecoder/X-Decoder/resolve/main/xdecoder_focall_last.pt) | Focal-L | 56.5 | 46.7 | 67.2 | 62.8 | 67.5 | 76.3 | - | - | - | - |
+| SEEM_v0 | [ckpt](https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focalt_v0.pt) | Focal-T | 50.6 | 39.4 | 60.9 | 58.5 | 63.5 | 71.6 | 3.54 | 4.59 | * | * |
| SEEM_v0 | - | Davit-d3 | 56.2 | 46.8 | 65.3 | 63.2 | 68.3 | 76.6 | 2.99 | 3.89 | 5.93 | 9.23 |
-| SEEM_v0 | [ckpt]() | Focal-L | 56.2 | 46.4 | 65.5 | 62.8 | 67.7 | 76.2 | 3.04 | 3.85 | * | * |
-| SEEM_v1 | [ckpt]() | Focal-T | 50.8 | 39.4 | 60.7 | 58.5 | 63.7 | 72.0 | 3.19 | 4.13 | * | * |
-| SEEM_v1 | [ckpt]() | SAM-ViT-B | 52.0 | 43.5 | 60.2 | 54.1 | 62.2 | 69.3 | 2.53 | 3.23 | * | * |
-| SEEM_v1 | [ckpt]() | SAM-ViT-L | 49.0 | 41.6 | 58.2 | 53.8 | 62.2 | 69.5 | 2.40 | 2.96 | * | * |
-
-SEEM_v0: Supporting Single Interactive object training and inference
-SEEM_v1: Supporting Multiple Interactive objects training and inference
+| SEEM_v0 | [ckpt](https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v0.pt) | Focal-L | 56.2 | 46.4 | 65.5 | 62.8 | 67.7 | 76.2 | 3.04 | 3.85 | * | * |
+| SEEM_v1 | [ckpt](https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focalt_v1.pt) | Focal-T | 50.8 | 39.4 | 60.7 | 58.5 | 63.7 | 72.0 | 3.19 | 4.13 | * | * |
+| SEEM_v1 | [ckpt](https://huggingface.co/xdecoder/SEEM/resolve/main/seem_samvitb_v1.pt) | SAM-ViT-B | 52.0 | 43.5 | 60.2 | 54.1 | 62.2 | 69.3 | 2.53 | 3.23 | * | * |
+| SEEM_v1 | [ckpt](https://huggingface.co/xdecoder/SEEM/resolve/main/seem_samvitl_v1.pt) | SAM-ViT-L | 49.0 | 41.6 | 58.2 | 53.8 | 62.2 | 69.5 | 2.40 | 2.96 | * | * |
+**SEEM_v0:** Supporting Single Interactive object training and inference
+**SEEM_v1:** Supporting Multiple Interactive objects training and inference
:fire: **Related projects:**
diff --git a/assets/readmes/DATASET.md b/assets/readmes/DATASET.md
new file mode 100644
index 0000000..e69de29
diff --git a/assets/readmes/EVAL.md b/assets/readmes/EVAL.md
new file mode 100644
index 0000000..e69de29
diff --git a/assets/readmes/INSTALL.md b/assets/readmes/INSTALL.md
new file mode 100644
index 0000000..e69de29
diff --git a/assets/readmes/TRAIN.md b/assets/readmes/TRAIN.md
new file mode 100644
index 0000000..e69de29
diff --git a/assets/requirements/requirements.txt b/assets/requirements/requirements.txt
new file mode 100644
index 0000000..1ec8915
--- /dev/null
+++ b/assets/requirements/requirements.txt
@@ -0,0 +1,34 @@
+torch==2.1.0
+torchvision==0.16.0
+pillow==9.4.0
+opencv-python==4.8.1.78
+pyyaml==6.0.1
+json_tricks==3.17.3
+yacs==0.1.8
+scikit-learn==1.3.1
+pandas==2.0.3
+timm==0.4.12
+numpy==1.23.1
+einops==0.7.0
+fvcore==0.1.5.post20221221
+transformers==4.34.0
+sentencepiece==0.1.99
+ftfy==6.1.1
+regex==2023.10.3
+nltk==3.8.1
+mpi4py==3.1.5
+vision-datasets==0.2.2
+cython==3.0.2
+pycocotools==2.0.7
+diffdist==0.1
+pyarrow==13.0.0
+cityscapesscripts==2.2.2
+shapely==1.8.0
+scikit-image==0.21.0
+mup==1.0.0
+accelerate==0.23.0
+kornia==0.7.0
+deepspeed==0.10.3
+wandb==0.15.12
+infinibatch==0.1.1
+gradio==3.42.0
\ No newline at end of file
diff --git a/assets/requirements/requirements_custom.txt b/assets/requirements/requirements_custom.txt
new file mode 100644
index 0000000..d254bf2
--- /dev/null
+++ b/assets/requirements/requirements_custom.txt
@@ -0,0 +1,3 @@
+git+https://github.com/arogozhnikov/einops.git
+git+https://github.com/MaureenZOU/detectron2-xyz.git
+git+https://github.com/openai/whisper.git
\ No newline at end of file
diff --git a/demo/seem/app.py b/demo/seem/app.py
index fb86b91..66d6621 100644
--- a/demo/seem/app.py
+++ b/demo/seem/app.py
@@ -42,14 +42,14 @@ opt = init_distributed(opt)
# META DATA
cur_model = 'None'
if 'focalt' in cfg.conf_files:
- pretrained_pth = os.path.join("seem_focalt_v2.pt")
+ pretrained_pth = os.path.join("seem_focalt_v0.pt")
if not os.path.exists(pretrained_pth):
- os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focalt_v2.pt"))
+ os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focalt_v0.pt"))
cur_model = 'Focal-T'
elif 'focal' in cfg.conf_files:
- pretrained_pth = os.path.join("seem_focall_v1.pt")
+ pretrained_pth = os.path.join("seem_focall_v0.pt")
if not os.path.exists(pretrained_pth):
- os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v1.pt"))
+ os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v0.pt"))
cur_model = 'Focal-L'
'''
diff --git a/modeling/vision/encoder/__init__.py b/modeling/vision/encoder/__init__.py
index 624fc1d..89af463 100755
--- a/modeling/vision/encoder/__init__.py
+++ b/modeling/vision/encoder/__init__.py
@@ -1,5 +1,8 @@
from .transformer_encoder_fpn import *
-from .transformer_encoder_deform import *
+try:  # optional encoder: the package must stay importable when this module cannot be loaded
+    from .transformer_encoder_deform import *
+except Exception as err:  # not a bare except, so KeyboardInterrupt/SystemExit still propagate
+    print('Deformable Transformer Encoder is not available.', err)
from .build import *
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 2eac1ea..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-pillow==9.4.0
-opencv-python
-pyyaml
-json_tricks
-yacs
-scikit-learn
-pandas
-timm==0.4.12
-numpy==1.23.1
-einops
-fvcore
-transformers
-sentencepiece
-ftfy
-regex
-nltk
-mpi4py
-vision-datasets==0.2.2
-cython
-pycocotools
-diffdist
-pyarrow
-cityscapesscripts
-shapely==1.8.0
-scikit-image
-mup
-accelerate
-kornia
-deepspeed
-wandb
-infinibatch
\ No newline at end of file