From 1a35b04624adbe2d2d2d88fd0670550ed0a53c57 Mon Sep 17 00:00:00 2001
From: MaureenZOU <xueyanzoucs@gmail.com>
Date: Wed, 4 Oct 2023 16:55:24 -0500
Subject: [PATCH] update requirements and script

---
 README.md                                   | 29 +++++++++---------
 assets/readmes/DATASET.md                   |  0
 assets/readmes/EVAL.md                      |  0
 assets/readmes/INSTALL.md                   |  0
 assets/readmes/TRAIN.md                     |  0
 assets/requirements/requirements.txt        | 34 +++++++++++++++++++++
 assets/requirements/requirements_custom.txt |  3 ++
 demo/seem/app.py                            |  8 ++---
 modeling/vision/encoder/__init__.py         |  5 ++-
 requirements.txt                            | 31 -------------------
 10 files changed, 59 insertions(+), 51 deletions(-)
 create mode 100644 assets/readmes/DATASET.md
 create mode 100644 assets/readmes/EVAL.md
 create mode 100644 assets/readmes/INSTALL.md
 create mode 100644 assets/readmes/TRAIN.md
 create mode 100644 assets/requirements/requirements.txt
 create mode 100644 assets/requirements/requirements_custom.txt
 delete mode 100644 requirements.txt
diff --git a/README.md b/README.md
index 634943d..55777dd 100644
--- a/README.md
+++ b/README.md
@@ -6,32 +6,31 @@ We introduce **SEEM** that can **S**egment **E**verything **E**verywhere with **
 
 **One-Line Demo with Linux:**
 ```sh
-git clone git@github.com:UX-Decoder/Segment-Everything-Everywhere-All-At-Once.git && cd Segment-Everything-Everywhere-All-At-Once/demo_code && sh run_demo.sh
+git clone git@github.com:UX-Decoder/Segment-Everything-Everywhere-All-At-Once.git && cd Segment-Everything-Everywhere-All-At-Once && sh assets/scripts/run_demo.sh
 ```
 
 **Getting Started:**
 
-[INSTALL.md]() <br>
-[DATASET.md]() <br>
-[TRAIN.md]() <br>
-[EVAL.md]()
+[INSTALL.md](assets/readmes/INSTALL.md) <br>
+[DATASET.md](assets/readmes/DATASET.md) <br>
+[TRAIN.md](assets/readmes/TRAIN.md) <br>
+[EVAL.md](assets/readmes/EVAL.md)
 
 :point_right: *[New]* **Latest Checkpoints and Numbers:**
 |                 |                                                                                      |          | COCO |      |      | Ref-COCOg |      |      | VOC   |       | SBD   |       |
 |-----------------|---------------------------------------------------------------------------------------------|----------|------|------|------|-----------|------|------|-------|-------|-------|-------|
 | Method          |  Checkpoint                                                                                  | backbone | PQ &uarr;  | mAP &uarr; | mIoU &uarr; | cIoU  &uarr; | mIoU &uarr; | AP50 &uarr; | NoC85 &darr; | NoC90 &darr;| NoC85 &darr;| NoC90 &darr;|
-| X-Decoder       |  [ckpt]() | Focal-T  | 50.8 | 39.5 | 62.4 | 57.6      | 63.2 | 71.6 | -     | -     | -     | -     |
-| X-Decoder-oq201 |  [ckpt]() | Focal-L  | 56.5 | 46.7 | 67.2 | 62.8      | 67.5 | 76.3 | -     | -     | -     | -     |
-| SEEM_v0            | [ckpt]()      | Focal-T  | 50.6 | 39.4 | 60.9 | 58.5      | 63.5 | 71.6 | 3.54  | 4.59  | *     | *     |
+| X-Decoder       |  [ckpt](https://huggingface.co/xdecoder/X-Decoder/resolve/main/xdecoder_focalt_last.pt) | Focal-T  | 50.8 | 39.5 | 62.4 | 57.6      | 63.2 | 71.6 | -     | -     | -     | -     |
+| X-Decoder-oq201 |  [ckpt](https://huggingface.co/xdecoder/X-Decoder/resolve/main/xdecoder_focall_last.pt) | Focal-L  | 56.5 | 46.7 | 67.2 | 62.8      | 67.5 | 76.3 | -     | -     | -     | -     |
+| SEEM_v0            | [ckpt](https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focalt_v0.pt)      | Focal-T  | 50.6 | 39.4 | 60.9 | 58.5      | 63.5 | 71.6 | 3.54  | 4.59  | *     | *     |
 | SEEM_v0            |  -                                                                                           | Davit-d3 | 56.2 | 46.8 | 65.3 | 63.2      | 68.3 | 76.6 | 2.99  | 3.89  | 5.93  | 9.23  |
-| SEEM_v0      | [ckpt]()       | Focal-L  | 56.2 | 46.4 | 65.5 | 62.8      | 67.7 | 76.2 | 3.04  | 3.85  | *     | *     |
-| SEEM_v1      | [ckpt]()       | Focal-T  | 50.8 | 39.4 | 60.7 |   58.5    |  63.7 | 72.0 | 3.19  | 4.13  | *     | *     |
-| SEEM_v1      | [ckpt]()       | SAM-ViT-B  | 52.0 | 43.5 | 60.2 | 54.1      | 62.2 | 69.3 | 2.53  | 3.23  | *     | *     |
-| SEEM_v1       | [ckpt]()       | SAM-ViT-L  | 49.0 | 41.6 | 58.2 | 53.8      | 62.2 | 69.5 | 2.40  | 2.96  | *     | *     |
-
-SEEM_v0: Supporting Single Interactive object training and inference <br>
-SEEM_v1: Supporting Multiple Interactive objects training and inference
+| SEEM_v0      | [ckpt](https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v0.pt)       | Focal-L  | 56.2 | 46.4 | 65.5 | 62.8      | 67.7 | 76.2 | 3.04  | 3.85  | *     | *     |
+| SEEM_v1      | [ckpt](https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focalt_v1.pt)       | Focal-T  | 50.8 | 39.4 | 60.7 |   58.5    |  63.7 | 72.0 | 3.19  | 4.13  | *     | *     |
+| SEEM_v1      | [ckpt](https://huggingface.co/xdecoder/SEEM/resolve/main/seem_samvitb_v1.pt)       | SAM-ViT-B  | 52.0 | 43.5 | 60.2 | 54.1      | 62.2 | 69.3 | 2.53  | 3.23  | *     | *     |
+| SEEM_v1       | [ckpt](https://huggingface.co/xdecoder/SEEM/resolve/main/seem_samvitl_v1.pt)       | SAM-ViT-L  | 49.0 | 41.6 | 58.2 | 53.8      | 62.2 | 69.5 | 2.40  | 2.96  | *     | *     |
 
+**SEEM_v0:** Supporting Single Interactive object training and inference <br>
+**SEEM_v1:** Supporting Multiple Interactive objects training and inference
 
 :fire: **Related projects:**
 
diff --git a/assets/readmes/DATASET.md b/assets/readmes/DATASET.md
new file mode 100644
index 0000000..e69de29
diff --git a/assets/readmes/EVAL.md b/assets/readmes/EVAL.md
new file mode 100644
index 0000000..e69de29
diff --git a/assets/readmes/INSTALL.md b/assets/readmes/INSTALL.md
new file mode 100644
index 0000000..e69de29
diff --git a/assets/readmes/TRAIN.md b/assets/readmes/TRAIN.md
new file mode 100644
index 0000000..e69de29
diff --git a/assets/requirements/requirements.txt b/assets/requirements/requirements.txt
new file mode 100644
index 0000000..1ec8915
--- /dev/null
+++ b/assets/requirements/requirements.txt
@@ -0,0 +1,34 @@
+torch==2.1.0
+torchvision==0.16.0
+pillow==9.4.0
+opencv-python==4.8.1.78
+pyyaml==6.0.1
+json_tricks==3.17.3
+yacs==0.1.8
+scikit-learn==1.3.1
+pandas==2.0.3
+timm==0.4.12
+numpy==1.23.1
+einops==0.7.0
+fvcore==0.1.5.post20221221
+transformers==4.34.0
+sentencepiece==0.1.99
+ftfy==6.1.1
+regex==2023.10.3
+nltk==3.8.1
+mpi4py==3.1.5
+vision-datasets==0.2.2
+cython==3.0.2
+pycocotools==2.0.7
+diffdist==0.1
+pyarrow==13.0.0
+cityscapesscripts==2.2.2
+shapely==1.8.0
+scikit-image==0.21.0
+mup==1.0.0
+accelerate==0.23.0
+kornia==0.7.0
+deepspeed==0.10.3
+wandb==0.15.12
+infinibatch==0.1.1
+gradio==3.42.0
\ No newline at end of file
diff --git a/assets/requirements/requirements_custom.txt b/assets/requirements/requirements_custom.txt
new file mode 100644
index 0000000..d254bf2
--- /dev/null
+++ b/assets/requirements/requirements_custom.txt
@@ -0,0 +1,3 @@
+git+https://github.com/arogozhnikov/einops.git
+git+https://github.com/MaureenZOU/detectron2-xyz.git
+git+https://github.com/openai/whisper.git
\ No newline at end of file
diff --git a/demo/seem/app.py b/demo/seem/app.py
index fb86b91..66d6621 100644
--- a/demo/seem/app.py
+++ b/demo/seem/app.py
@@ -42,14 +42,14 @@ opt = init_distributed(opt)
 # META DATA
 cur_model = 'None'
 if 'focalt' in cfg.conf_files:
-    pretrained_pth = os.path.join("seem_focalt_v2.pt")
+    pretrained_pth = os.path.join("seem_focalt_v0.pt")
     if not os.path.exists(pretrained_pth):
-        os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focalt_v2.pt"))
+        os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focalt_v0.pt"))
     cur_model = 'Focal-T'
 elif 'focal' in cfg.conf_files:
-    pretrained_pth = os.path.join("seem_focall_v1.pt")
+    pretrained_pth = os.path.join("seem_focall_v0.pt")
     if not os.path.exists(pretrained_pth):
-        os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v1.pt"))
+        os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v0.pt"))
     cur_model = 'Focal-L'
 
 '''
diff --git a/modeling/vision/encoder/__init__.py b/modeling/vision/encoder/__init__.py
index 624fc1d..89af463 100755
--- a/modeling/vision/encoder/__init__.py
+++ b/modeling/vision/encoder/__init__.py
@@ -1,5 +1,8 @@
 from .transformer_encoder_fpn import *
-from .transformer_encoder_deform import *
+try:
+    from .transformer_encoder_deform import *
+except ImportError:  # optional encoder: keep the package importable when its import fails
+    print('Deformable Transformer Encoder is not available.')
 from .build import *
 
 
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 2eac1ea..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-pillow==9.4.0
-opencv-python
-pyyaml
-json_tricks
-yacs
-scikit-learn
-pandas
-timm==0.4.12
-numpy==1.23.1
-einops
-fvcore
-transformers
-sentencepiece
-ftfy
-regex
-nltk
-mpi4py
-vision-datasets==0.2.2
-cython
-pycocotools
-diffdist
-pyarrow
-cityscapesscripts
-shapely==1.8.0
-scikit-image
-mup
-accelerate
-kornia
-deepspeed
-wandb
-infinibatch
\ No newline at end of file