From 80e8504956e8dd42187cc893e58e99a3573c55a6 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Tue, 25 Jan 2022 20:45:39 +0800 Subject: [PATCH] [Doc] Update `README.md` in configs according to latest standard. (#1233) * fix README.md in configs * fix README.md in configs * modify [ALGORITHM] to [BACKBONE] in backbone config README.md --- .dev/md2yml.py | 14 ++++---------- README.md | 2 +- README_zh-CN.md | 2 +- configs/ann/README.md | 11 ++++++----- configs/apcnet/README.md | 11 +++++------ configs/bisenetv1/README.md | 11 +++++------ configs/bisenetv2/README.md | 10 +++++----- configs/ccnet/README.md | 10 +++++----- configs/cgnet/README.md | 11 +++++------ configs/cgnet/cgnet.yml | 2 +- configs/danet/README.md | 11 +++++------ configs/deeplabv3/README.md | 16 ++++++---------- configs/deeplabv3plus/README.md | 18 +++++++----------- configs/dmnet/README.md | 11 +++++------ configs/dnlnet/README.md | 11 +++++------ configs/dpt/README.md | 11 +++++------ configs/emanet/README.md | 11 +++++------ configs/encnet/README.md | 11 +++++------ configs/erfnet/README.md | 11 +++++------ configs/fastfcn/README.md | 11 +++++------ configs/fastscnn/README.md | 11 +++++------ configs/fcn/README.md | 11 +++++------ configs/gcnet/README.md | 11 +++++------ configs/hrnet/README.md | 13 ++++++------- configs/icnet/README.md | 11 +++++------ configs/isanet/README.md | 14 ++++++++------ configs/mobilenet_v2/README.md | 13 ++++++------- configs/mobilenet_v3/README.md | 13 ++++++------- configs/nonlocal_net/README.md | 11 +++++------ configs/ocrnet/README.md | 11 +++++------ configs/point_rend/README.md | 11 +++++------ configs/psanet/README.md | 11 +++++------ configs/pspnet/README.md | 11 +++++------ configs/resnest/README.md | 13 ++++++------- configs/segformer/README.md | 11 +++++------ configs/segformer/segformer.yml | 3 ++- configs/sem_fpn/README.md | 27 +++++++++++---------------- configs/setr/README.md | 11 +++++------ configs/stdc/README.md | 11 +++++------ configs/swin/README.md | 13 ++++++------- configs/twins/README.md | 13 ++++++------- configs/unet/README.md | 11 +++++------ configs/upernet/README.md | 11 +++++------ configs/vit/README.md | 11 +++++------ configs/vit/vit.yml | 2 +- 45 files changed, 224 insertions(+), 271 deletions(-) diff --git a/.dev/md2yml.py b/.dev/md2yml.py index 4f7b876b8..4c2e129f2 100755 --- a/.dev/md2yml.py +++ b/.dev/md2yml.py @@ -87,12 +87,13 @@ def parse_md(md_file): current_dataset = '' while i < len(lines): line = lines[i].strip() + # In latest README.md the title and url are in the third line. + if i == 2: + paper_url = lines[i].split('](')[1].split(')')[0] + paper_title = lines[i].split('](')[0].split('[')[1] if len(line) == 0: i += 1 continue - if line[:2] == '# ': - paper_title = line.replace('# ', '') - i += 1 elif line[:3] == ' -
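The new `parse_md` logic above assumes the updated `README.md` layout: a short model-name heading on the first line and, on the third line (index 2), a Markdown link of the form `[Paper Title](paper-url)` that the script splits to recover the paper title and URL for the generated `.yml` metadata. A minimal standalone sketch of that extraction, using the CCNet link from this patch as a sample line (the variable names and the assert checks are illustrative only, not part of `.dev/md2yml.py`):

```python
# Sketch of the title/URL extraction added to .dev/md2yml.py in this patch.
# It assumes the third line of a refreshed README.md holds "[Paper Title](paper-url)".
# The sample line below reuses the CCNet link touched by this patch.
line = '[CCNet: Criss-Cross Attention for Semantic Segmentation](https://arxiv.org/abs/1811.11721)'

# Everything between "](" and ")" is the paper URL.
paper_url = line.split('](')[1].split(')')[0]
# Everything between "[" and "](" is the paper title.
paper_title = line.split('](')[0].split('[')[1]

assert paper_url == 'https://arxiv.org/abs/1811.11721'
assert paper_title == 'CCNet: Criss-Cross Attention for Semantic Segmentation'
```

Because the split is purely positional, `parse_md` relies on every config README placing exactly one such link on its third line, which is the convention this patch establishes across `configs/`.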
-ANN (ICCV'2019) -```latex +## Citation + +```bibtex @inproceedings{zhu2019asymmetric, title={Asymmetric non-local neural networks for semantic segmentation}, author={Zhu, Zhen and Xu, Mengde and Bai, Song and Huang, Tengteng and Bai, Xiang}, @@ -32,7 +34,6 @@ The non-local module works as a particularly useful technique for semantic segme } ``` -
## Results and models diff --git a/configs/apcnet/README.md b/configs/apcnet/README.md index 9ebb090e2..5e1fd6b42 100644 --- a/configs/apcnet/README.md +++ b/configs/apcnet/README.md @@ -1,4 +1,6 @@ -# Adaptive Pyramid Context Network for Semantic Segmentation +# APCNet + +[Adaptive Pyramid Context Network for Semantic Segmentation](https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html) ## Introduction @@ -19,10 +21,9 @@ Recent studies witnessed that context features can significantly improve the per -
-APCNet (CVPR'2019) +## Citation -```latex +```bibtex @InProceedings{He_2019_CVPR, author = {He, Junjun and Deng, Zhongying and Zhou, Lei and Wang, Yali and Qiao, Yu}, title = {Adaptive Pyramid Context Network for Semantic Segmentation}, @@ -32,8 +33,6 @@ year = {2019} } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md index 2438e902d..7a9e5faa9 100644 --- a/configs/bisenetv1/README.md +++ b/configs/bisenetv1/README.md @@ -1,4 +1,6 @@ -# BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation +# BiSeNetV1 + +[BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation](https://arxiv.org/abs/1808.00897) ## Introduction @@ -19,10 +21,9 @@ Semantic segmentation requires both rich spatial information and sizeable recept -
-BiSeNetV1 (ECCV'2018) +## Citation -```latex +```bibtex @inproceedings{yu2018bisenet, title={Bisenet: Bilateral segmentation network for real-time semantic segmentation}, author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong}, @@ -32,8 +33,6 @@ Semantic segmentation requires both rich spatial information and sizeable recept } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/bisenetv2/README.md b/configs/bisenetv2/README.md index 825a07a0d..1bc742490 100644 --- a/configs/bisenetv2/README.md +++ b/configs/bisenetv2/README.md @@ -1,4 +1,6 @@ -# Bisenet v2: Bilateral Network with Guided Aggregation for Real-time Semantic Segmentation +# BiSeNetV2 + +[Bisenet v2: Bilateral Network with Guided Aggregation for Real-time Semantic Segmentation](https://arxiv.org/abs/2004.02147) ## Introduction @@ -19,10 +21,9 @@ The low-level details and high-level semantics are both essential to the semanti -
-BiSeNetV2 (IJCV'2021) +## Citation -```latex +```bibtex @article{yu2021bisenet, title={Bisenet v2: Bilateral network with guided aggregation for real-time semantic segmentation}, author={Yu, Changqian and Gao, Changxin and Wang, Jingbo and Yu, Gang and Shen, Chunhua and Sang, Nong}, @@ -33,7 +34,6 @@ The low-level details and high-level semantics are both essential to the semanti } ``` -
## Results and models diff --git a/configs/ccnet/README.md b/configs/ccnet/README.md index bf318f6ce..9cefcf023 100644 --- a/configs/ccnet/README.md +++ b/configs/ccnet/README.md @@ -1,4 +1,6 @@ -# CCNet: Criss-Cross Attention for Semantic Segmentation +# CCNet + +[CCNet: Criss-Cross Attention for Semantic Segmentation](https://arxiv.org/abs/1811.11721) ## Introduction @@ -19,10 +21,9 @@ Contextual information is vital in visual understanding problems, such as semant -
-CCNet (ICCV'2019) +## Citation -```latex +```bibtex @article{huang2018ccnet, title={CCNet: Criss-Cross Attention for Semantic Segmentation}, author={Huang, Zilong and Wang, Xinggang and Huang, Lichao and Huang, Chang and Wei, Yunchao and Liu, Wenyu}, @@ -31,7 +32,6 @@ Contextual information is vital in visual understanding problems, such as semant } ``` -
## Results and models diff --git a/configs/cgnet/README.md b/configs/cgnet/README.md index 69d46888d..fefb29140 100644 --- a/configs/cgnet/README.md +++ b/configs/cgnet/README.md @@ -1,4 +1,6 @@ -# CGNet: A Light-weight Context Guided Network for Semantic Segmentation +# CGNet + +[CGNet: A Light-weight Context Guided Network for Semantic Segmentation](https://arxiv.org/abs/1811.08201) ## Introduction @@ -19,10 +21,9 @@ The demand of applying semantic segmentation model on mobile devices has been in -
-CGNet (TIP'2020) +## Citation -```latext +```bibtex @article{wu2020cgnet, title={Cgnet: A light-weight context guided network for semantic segmentation}, author={Wu, Tianyi and Tang, Sheng and Zhang, Rui and Cao, Juan and Zhang, Yongdong}, journal={IEEE Transactions on Image Processing}, volume={30}, pages={1169--1179}, year={2020}, publisher={IEEE} } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/cgnet/cgnet.yml b/configs/cgnet/cgnet.yml index ebb3760dd..b1506e0fe 100644 --- a/configs/cgnet/cgnet.yml +++ b/configs/cgnet/cgnet.yml @@ -4,7 +4,7 @@ Collections: Training Data: - Cityscapes Paper: - URL: https://arxiv.org/pdf/1811.08201.pdf + URL: https://arxiv.org/abs/1811.08201 Title: 'CGNet: A Light-weight Context Guided Network for Semantic Segmentation' README: configs/cgnet/README.md Code: diff --git a/configs/danet/README.md b/configs/danet/README.md index 69bc98a6f..411c59562 100644 --- a/configs/danet/README.md +++ b/configs/danet/README.md @@ -1,4 +1,6 @@ -# Dual Attention Network for Scene Segmentation +# DANet + +[Dual Attention Network for Scene Segmentation](https://arxiv.org/abs/1809.02983) ## Introduction @@ -19,10 +21,9 @@ In this paper, we address the scene segmentation task by capturing rich contextu -
-DANet (CVPR'2019) +## Citation -```latex +```bibtex @article{fu2018dual, title={Dual Attention Network for Scene Segmentation}, author={Jun Fu, Jing Liu, Haijie Tian, Yong Li, Yongjun Bao, Zhiwei Fang,and Hanqing Lu}, @@ -31,8 +32,6 @@ In this paper, we address the scene segmentation task by capturing rich contextu } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/deeplabv3/README.md b/configs/deeplabv3/README.md index 6006839ac..a5d85a5ef 100644 --- a/configs/deeplabv3/README.md +++ b/configs/deeplabv3/README.md @@ -1,4 +1,6 @@ -# Rethinking atrous convolution for semantic image segmentation +# DeepLabV3 + +[Rethinking atrous convolution for semantic image segmentation](https://arxiv.org/abs/1706.05587) ## Introduction @@ -19,10 +21,9 @@ In this work, we revisit atrous convolution, a powerful tool to explicitly adjus -
-DeepLabV3 (ArXiv'2017) +## Citation -```latext +```bibtex @article{chen2017rethinking, title={Rethinking atrous convolution for semantic image segmentation}, author={Chen, Liang-Chieh and Papandreou, George and Schroff, Florian and Adam, Hartwig}, journal={arXiv preprint arXiv:1706.05587}, year={2017} } ``` -
- ## Results and models -:::{note} -`D-8` here corresponding to the output stride 8 setting for DeepLab series. -::: - ### Cityscapes | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | @@ -117,4 +112,5 @@ In this work, we revisit atrous convolution, a powerful tool to explicitly adjus Note: +- `D-8` here corresponding to the output stride 8 setting for DeepLab series. - `FP16` means Mixed Precision (FP16) is adopted in training. diff --git a/configs/deeplabv3plus/README.md b/configs/deeplabv3plus/README.md index a36ec8968..91b66dd50 100644 --- a/configs/deeplabv3plus/README.md +++ b/configs/deeplabv3plus/README.md @@ -1,4 +1,6 @@ -# Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation +# DeepLabV3+ + +[Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611) ## Introduction @@ -19,10 +21,9 @@ Spatial pyramid pooling module or encode-decoder structure are used in deep neur -
-DeepLabV3+ (CVPR'2018) +## Citation -```latex +```bibtex @inproceedings{deeplabv3plus2018, title={Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation}, author={Liang-Chieh Chen and Yukun Zhu and George Papandreou and Florian Schroff and Hartwig Adam}, @@ -31,15 +32,8 @@ Spatial pyramid pooling module or encode-decoder structure are used in deep neur } ``` -
- ## Results and models -:::{note} -`D-8`/`D-16` here corresponding to the output stride 8/16 setting for DeepLab series. -`MG-124` stands for multi-grid dilation in the last stage of ResNet. -::: - ### Cityscapes | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | @@ -122,4 +116,6 @@ Spatial pyramid pooling module or encode-decoder structure are used in deep neur Note: +- `D-8`/`D-16` here corresponding to the output stride 8/16 setting for DeepLab series. +- `MG-124` stands for multi-grid dilation in the last stage of ResNet. - `FP16` means Mixed Precision (FP16) is adopted in training. diff --git a/configs/dmnet/README.md b/configs/dmnet/README.md index 1bb497a13..0729268ca 100644 --- a/configs/dmnet/README.md +++ b/configs/dmnet/README.md @@ -1,4 +1,6 @@ -# Dynamic Multi-scale Filters for Semantic Segmentation +# DMNet + +[Dynamic Multi-scale Filters for Semantic Segmentation](https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf) ## Introduction @@ -19,10 +21,9 @@ Multi-scale representation provides an effective way toaddress scale variation o -
-DMNet (ICCV'2019) +## Citation -```latex +```bibtex @InProceedings{He_2019_ICCV, author = {He, Junjun and Deng, Zhongying and Qiao, Yu}, title = {Dynamic Multi-Scale Filters for Semantic Segmentation}, @@ -32,8 +33,6 @@ year = {2019} } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/dnlnet/README.md b/configs/dnlnet/README.md index f8c6e6c75..ff335113d 100644 --- a/configs/dnlnet/README.md +++ b/configs/dnlnet/README.md @@ -1,4 +1,6 @@ -# Disentangled Non-Local Neural Networks +# DNLNet + +[Disentangled Non-Local Neural Networks](https://arxiv.org/abs/2006.06668) ## Introduction @@ -19,14 +21,13 @@ The non-local block is a popular module for strengthening the context modeling a -
-DNLNet (ECCV'2020) +## Citation This example is to reproduce ["Disentangled Non-Local Neural Networks"](https://arxiv.org/abs/2006.06668) for semantic segmentation. It is still in progress. ## Citation -```latex +```bibtex @misc{yin2020disentangled, title={Disentangled Non-Local Neural Networks}, author={Minghao Yin and Zhuliang Yao and Yue Cao and Xiu Li and Zheng Zhang and Stephen Lin and Han Hu}, @@ -35,8 +36,6 @@ This example is to reproduce ["Disentangled Non-Local Neural Networks"](https:// } ``` -
- ## Results and models (in progress) ### Cityscapes diff --git a/configs/dpt/README.md b/configs/dpt/README.md index 63d59bcc7..2fd8d32a4 100644 --- a/configs/dpt/README.md +++ b/configs/dpt/README.md @@ -1,4 +1,6 @@ -# Vision Transformer for Dense Prediction +# DPT + +[Vision Transformer for Dense Prediction](https://arxiv.org/abs/2103.13413) ## Introduction @@ -19,10 +21,9 @@ We introduce dense vision transformers, an architecture that leverages vision tr -
-DPT (ArXiv'2021) +## Citation -```latex +```bibtex @article{dosoViTskiy2020, title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, author={DosoViTskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, @@ -38,8 +39,6 @@ We introduce dense vision transformers, an architecture that leverages vision tr } ``` -
- ## Usage To use other repositories' pre-trained models, it is necessary to convert keys. diff --git a/configs/emanet/README.md b/configs/emanet/README.md index 73dea783d..34dba42ec 100644 --- a/configs/emanet/README.md +++ b/configs/emanet/README.md @@ -1,4 +1,6 @@ -# Expectation-Maximization Attention Networks for Semantic Segmentation +# EMANet + +[Expectation-Maximization Attention Networks for Semantic Segmentation](https://arxiv.org/abs/1907.13426) ## Introduction @@ -19,10 +21,9 @@ Self-attention mechanism has been widely used for various tasks. It is designed -
-EMANet (ICCV'2019) +## Citation -```latex +```bibtex @inproceedings{li2019expectation, title={Expectation-maximization attention networks for semantic segmentation}, author={Li, Xia and Zhong, Zhisheng and Wu, Jianlong and Yang, Yibo and Lin, Zhouchen and Liu, Hong}, @@ -32,8 +33,6 @@ Self-attention mechanism has been widely used for various tasks. It is designed } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/encnet/README.md b/configs/encnet/README.md index f5925879e..64cfe1ab8 100644 --- a/configs/encnet/README.md +++ b/configs/encnet/README.md @@ -1,4 +1,6 @@ -# Context Encoding for Semantic Segmentation +# EncNet + +[Context Encoding for Semantic Segmentation](https://arxiv.org/abs/1803.08904) ## Introduction @@ -19,10 +21,9 @@ Recent work has made significant progress in improving spatial resolution for pi -
-EncNet (CVPR'2018) +## Citation -```latex +```bibtex @InProceedings{Zhang_2018_CVPR, author = {Zhang, Hang and Dana, Kristin and Shi, Jianping and Zhang, Zhongyue and Wang, Xiaogang and Tyagi, Ambrish and Agrawal, Amit}, title = {Context Encoding for Semantic Segmentation}, @@ -32,8 +33,6 @@ year = {2018} } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/erfnet/README.md b/configs/erfnet/README.md index 6d0811709..a0ccb6114 100644 --- a/configs/erfnet/README.md +++ b/configs/erfnet/README.md @@ -1,4 +1,6 @@ -# ERFNet: Efficient Residual Factorized ConvNet for Real-time Semantic Segmentation +# ERFNet + +[ERFNet: Efficient Residual Factorized ConvNet for Real-time Semantic Segmentation](http://www.robesafe.uah.es/personal/eduardo.romera/pdfs/Romera17tits.pdf) ## Introduction @@ -19,10 +21,9 @@ Semantic segmentation is a challenging task that addresses most of the perceptio -
-ERFNet (T-ITS'2017) +## Citation -```latex +```bibtex @article{romera2017erfnet, title={Erfnet: Efficient residual factorized convnet for real-time semantic segmentation}, author={Romera, Eduardo and Alvarez, Jos{\'e} M and Bergasa, Luis M and Arroyo, Roberto}, @@ -35,8 +36,6 @@ Semantic segmentation is a challenging task that addresses most of the perceptio } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/fastfcn/README.md b/configs/fastfcn/README.md index f247fcf62..eea215847 100644 --- a/configs/fastfcn/README.md +++ b/configs/fastfcn/README.md @@ -1,4 +1,6 @@ -# FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation +# FastFCN + +[FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation](https://arxiv.org/abs/1903.11816) ## Introduction @@ -19,10 +21,9 @@ Modern approaches for semantic segmentation usually employ dilated convolutions -
-FastFCN (ArXiv'2019) +## Citation -```latex +```bibtex @article{wu2019fastfcn, title={Fastfcn: Rethinking dilated convolution in the backbone for semantic segmentation}, author={Wu, Huikai and Zhang, Junge and Huang, Kaiqi and Liang, Kongming and Yu, Yizhou}, @@ -31,8 +32,6 @@ year={2019} } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/fastscnn/README.md b/configs/fastscnn/README.md index 3556546fe..570faff0e 100644 --- a/configs/fastscnn/README.md +++ b/configs/fastscnn/README.md @@ -1,4 +1,6 @@ -# Fast-SCNN for Semantic Segmentation +# Fast-SCNN + +[Fast-SCNN for Semantic Segmentation](https://arxiv.org/abs/1902.04502) ## Introduction @@ -19,10 +21,9 @@ The encoder-decoder framework is state-of-the-art for offline semantic image seg -
-Fast-SCNN (ArXiv'2019) +## Citation -```latex +```bibtex @article{poudel2019fast, title={Fast-scnn: Fast semantic segmentation network}, author={Poudel, Rudra PK and Liwicki, Stephan and Cipolla, Roberto}, @@ -31,8 +32,6 @@ The encoder-decoder framework is state-of-the-art for offline semantic image seg } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/fcn/README.md b/configs/fcn/README.md index ff52cc41c..8863e7a3a 100644 --- a/configs/fcn/README.md +++ b/configs/fcn/README.md @@ -1,4 +1,6 @@ -# Fully Convolutional Networks for Semantic Segmentation +# FCN + +[Fully Convolutional Networks for Semantic Segmentation](https://arxiv.org/abs/1411.4038) ## Introduction @@ -19,10 +21,9 @@ Convolutional networks are powerful visual models that yield hierarchies of feat -
-FCN (CVPR'2015/TPAMI'2017) +## Citation -```latex +```bibtex @article{shelhamer2017fully, title={Fully convolutional networks for semantic segmentation}, author={Shelhamer, Evan and Long, Jonathan and Darrell, Trevor}, @@ -35,8 +36,6 @@ Convolutional networks are powerful visual models that yield hierarchies of feat } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/gcnet/README.md b/configs/gcnet/README.md index b6a44b2bc..47f2f434e 100644 --- a/configs/gcnet/README.md +++ b/configs/gcnet/README.md @@ -1,4 +1,6 @@ -# GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond +# GCNet + +[GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond](https://arxiv.org/abs/1904.11492) ## Introduction @@ -19,10 +21,9 @@ The Non-Local Network (NLNet) presents a pioneering approach for capturing long- -
-GCNet (ICCVW'2019/TPAMI'2020) +## Citation -```latex +```bibtex @inproceedings{cao2019gcnet, title={Gcnet: Non-local networks meet squeeze-excitation networks and beyond}, author={Cao, Yue and Xu, Jiarui and Lin, Stephen and Wei, Fangyun and Hu, Han}, @@ -32,8 +33,6 @@ The Non-Local Network (NLNet) presents a pioneering approach for capturing long- } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/hrnet/README.md b/configs/hrnet/README.md index a6de1db2d..885ec19b1 100644 --- a/configs/hrnet/README.md +++ b/configs/hrnet/README.md @@ -1,8 +1,10 @@ -# Deep High-Resolution Representation Learning for Human Pose Estimation +# HRNet + +[Deep High-Resolution Representation Learning for Human Pose Estimation](https://arxiv.org/abs/1908.07919) ## Introduction - + Official Repo @@ -19,10 +21,9 @@ High-resolution representations are essential for position-sensitive vision prob -
-HRNet (CVPR'2019) +## Citation -```latext +```bibtex @inproceedings{SunXLW19, title={Deep High-Resolution Representation Learning for Human Pose Estimation}, author={Ke Sun and Bin Xiao and Dong Liu and Jingdong Wang}, booktitle={CVPR}, year={2019} } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/icnet/README.md b/configs/icnet/README.md index a1625008f..48e8b46aa 100644 --- a/configs/icnet/README.md +++ b/configs/icnet/README.md @@ -1,4 +1,6 @@ -# ICNet for Real-time Semantic Segmentation on High-resolution Images +# ICNet + +[ICNet for Real-time Semantic Segmentation on High-resolution Images](https://arxiv.org/abs/1704.08545) ## Introduction @@ -19,10 +21,9 @@ We focus on the challenging task of real-time semantic segmentation in this pape -
-ICNet (ECCV'2018) +## Citation -```latext +```bibtex @inproceedings{zhao2018icnet, title={Icnet for real-time semantic segmentation on high-resolution images}, author={Zhao, Hengshuang and Qi, Xiaojuan and Shen, Xiaoyong and Shi, Jianping and Jia, Jiaya}, booktitle={Proceedings of the European conference on computer vision (ECCV)}, pages={405--420}, year={2018} } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/isanet/README.md b/configs/isanet/README.md index 7ffed5314..a120e2e3f 100644 --- a/configs/isanet/README.md +++ b/configs/isanet/README.md @@ -1,4 +1,6 @@ -# Interlaced Sparse Self-Attention for Semantic Segmentation +# ISANet + +[Interlaced Sparse Self-Attention for Semantic Segmentation](https://arxiv.org/abs/1907.12273) ## Introduction @@ -19,18 +21,20 @@ In this paper, we present a so-called interlaced sparse self-attention approach -
-ISANet (ArXiv'2019/IJCV'2021) +## Citation -``` +```bibtex @article{huang2019isa, title={Interlaced Sparse Self-Attention for Semantic Segmentation}, author={Huang, Lang and Yuan, Yuhui and Guo, Jianyuan and Zhang, Chao and Chen, Xilin and Wang, Jingdong}, journal={arXiv preprint arXiv:1907.12273}, year={2019} } +``` The technical report above is also presented at: + +```bibtex @article{yuan2021ocnet, title={OCNet: Object Context for Semantic Segmentation}, author={Yuan, Yuhui and Huang, Lang and Guo, Jianyuan and Zhang, Chao and Chen, Xilin and Wang, Jingdong}, journal={International Journal of Computer Vision}, pages={1--24}, year={2021}, publisher={Springer} } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/mobilenet_v2/README.md b/configs/mobilenet_v2/README.md index 697a00611..bef889870 100644 --- a/configs/mobilenet_v2/README.md +++ b/configs/mobilenet_v2/README.md @@ -1,8 +1,10 @@ -# MobileNetV2: Inverted Residuals and Linear Bottlenecks +# MobileNetV2 + +[MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) ## Introduction - + Official Repo @@ -20,10 +22,9 @@ The MobileNetV2 architecture is based on an inverted residual structure where th -
-MobileNetV2 (CVPR'2018) +## Citation -```latex +```bibtex @inproceedings{sandler2018mobilenetv2, title={Mobilenetv2: Inverted residuals and linear bottlenecks}, author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, @@ -33,8 +34,6 @@ The MobileNetV2 architecture is based on an inverted residual structure where th } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/mobilenet_v3/README.md b/configs/mobilenet_v3/README.md index cb1940a83..b08ac2728 100644 --- a/configs/mobilenet_v3/README.md +++ b/configs/mobilenet_v3/README.md @@ -1,8 +1,10 @@ -# Searching for MobileNetV3 +# MobileNetV3 + +[Searching for MobileNetV3](https://arxiv.org/abs/1905.02244) ## Introduction - + Official Repo @@ -19,10 +21,9 @@ We present the next generation of MobileNets based on a combination of complemen -
-MobileNetV3 (ICCV'2019) +## Citation -```latex +```bibtex @inproceedings{Howard_2019_ICCV, title={Searching for MobileNetV3}, author={Howard, Andrew and Sandler, Mark and Chu, Grace and Chen, Liang-Chieh and Chen, Bo and Tan, Mingxing and Wang, Weijun and Zhu, Yukun and Pang, Ruoming and Vasudevan, Vijay and Le, Quoc V. and Adam, Hartwig}, @@ -34,8 +35,6 @@ We present the next generation of MobileNets based on a combination of complemen } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/nonlocal_net/README.md b/configs/nonlocal_net/README.md index 0a67f6a97..787d87b84 100644 --- a/configs/nonlocal_net/README.md +++ b/configs/nonlocal_net/README.md @@ -1,4 +1,6 @@ -# Non-local Neural Networks +# NonLocal Net + +[Non-local Neural Networks](https://arxiv.org/abs/1711.07971) ## Introduction @@ -19,10 +21,9 @@ Both convolutional and recurrent operations are building blocks that process one -
-NonLocal Net (CVPR'2018) +## Citation -```latex +```bibtex @inproceedings{wang2018non, title={Non-local neural networks}, author={Wang, Xiaolong and Girshick, Ross and Gupta, Abhinav and He, Kaiming}, @@ -32,8 +33,6 @@ Both convolutional and recurrent operations are building blocks that process one } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/ocrnet/README.md b/configs/ocrnet/README.md index ea2eb4275..ef7312af0 100644 --- a/configs/ocrnet/README.md +++ b/configs/ocrnet/README.md @@ -1,4 +1,6 @@ -# Object-Contextual Representations for Semantic Segmentation +# OCRNet + +[Object-Contextual Representations for Semantic Segmentation](https://arxiv.org/abs/1909.11065) ## Introduction @@ -19,10 +21,9 @@ In this paper, we address the problem of semantic segmentation and focus on the -
-OCRNet (ECCV'2020) +## Citation -```latex +```bibtex @article{YuanW18, title={Ocnet: Object context network for scene parsing}, author={Yuhui Yuan and Jingdong Wang}, @@ -38,8 +39,6 @@ In this paper, we address the problem of semantic segmentation and focus on the } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/point_rend/README.md b/configs/point_rend/README.md index 093047741..34448e36b 100644 --- a/configs/point_rend/README.md +++ b/configs/point_rend/README.md @@ -1,4 +1,6 @@ -# PointRend: Image Segmentation as Rendering +# PointRend + +[PointRend: Image Segmentation as Rendering](https://arxiv.org/abs/1912.08193) ## Introduction @@ -19,10 +21,9 @@ We present a new method for efficient high-quality image segmentation of objects -
-PointRend (CVPR'2020) +## Citation -``` +```bibtex @inproceedings{kirillov2020pointrend, title={Pointrend: Image segmentation as rendering}, author={Kirillov, Alexander and Wu, Yuxin and He, Kaiming and Girshick, Ross}, @@ -32,8 +33,6 @@ We present a new method for efficient high-quality image segmentation of objects } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/psanet/README.md b/configs/psanet/README.md index 06126166e..fede7d439 100644 --- a/configs/psanet/README.md +++ b/configs/psanet/README.md @@ -1,4 +1,6 @@ -# PSANet: Point-wise Spatial Attention Network for Scene Parsing +# PSANet + +[PSANet: Point-wise Spatial Attention Network for Scene Parsing](https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf) ## Introduction @@ -19,10 +21,9 @@ We notice information flow in convolutional neural networksis restricted insid -
-PSANet (ECCV'2018) +## Citation -```latex +```bibtex @inproceedings{zhao2018psanet, title={Psanet: Point-wise spatial attention network for scene parsing}, author={Zhao, Hengshuang and Zhang, Yi and Liu, Shu and Shi, Jianping and Change Loy, Chen and Lin, Dahua and Jia, Jiaya}, @@ -32,8 +33,6 @@ We notice information flow in convolutional neural networksis restricted insid } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/pspnet/README.md b/configs/pspnet/README.md index 25f021bbb..ca8bddabb 100644 --- a/configs/pspnet/README.md +++ b/configs/pspnet/README.md @@ -1,4 +1,6 @@ -# Pyramid Scene Parsing Network +# PSPNet + +[Pyramid Scene Parsing Network](https://arxiv.org/abs/1612.01105) ## Introduction @@ -19,10 +21,9 @@ Scene parsing is challenging for unrestricted open vocabulary and diverse scenes -
-PSPNet (CVPR'2017) +## Citation -```latex +```bibtex @inproceedings{zhao2017pspnet, title={Pyramid Scene Parsing Network}, author={Zhao, Hengshuang and Shi, Jianping and Qi, Xiaojuan and Wang, Xiaogang and Jia, Jiaya}, @@ -31,8 +32,6 @@ Scene parsing is challenging for unrestricted open vocabulary and diverse scenes } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/resnest/README.md b/configs/resnest/README.md index 7b29a9b4f..fbabf98e3 100644 --- a/configs/resnest/README.md +++ b/configs/resnest/README.md @@ -1,8 +1,10 @@ -# ResNeSt: Split-Attention Networks +# ResNeSt + +[ResNeSt: Split-Attention Networks](https://arxiv.org/abs/2004.08955) ## Introduction - + Official Repo @@ -19,10 +21,9 @@ It is well known that featuremap attention and multi-path representation are imp -
-ResNeSt (ArXiv'2020) +## Citation -```latex +```bibtex @article{zhang2020resnest, title={ResNeSt: Split-Attention Networks}, author={Zhang, Hang and Wu, Chongruo and Zhang, Zhongyue and Zhu, Yi and Zhang, Zhi and Lin, Haibin and Sun, Yue and He, Tong and Muller, Jonas and Manmatha, R. and Li, Mu and Smola, Alexander}, @@ -31,8 +32,6 @@ year={2020} } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/segformer/README.md b/configs/segformer/README.md index 560696072..790c0f519 100644 --- a/configs/segformer/README.md +++ b/configs/segformer/README.md @@ -1,4 +1,6 @@ -# SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers +# SegFormer + +[SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) ## Introduction @@ -19,10 +21,9 @@ We present SegFormer, a simple, efficient yet powerful semantic segmentation fra -
-SegFormer (ArXiv'2021) +## Citation -```latex +```bibtex @article{xie2021segformer, title={SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers}, author={Xie, Enze and Wang, Wenhai and Yu, Zhiding and Anandkumar, Anima and Alvarez, Jose M and Luo, Ping}, @@ -31,8 +32,6 @@ We present SegFormer, a simple, efficient yet powerful semantic segmentation fra } ``` -
- ## Usage To use other repositories' pre-trained models, it is necessary to convert keys. diff --git a/configs/segformer/segformer.yml b/configs/segformer/segformer.yml index 4d9453257..c4efc582a 100644 --- a/configs/segformer/segformer.yml +++ b/configs/segformer/segformer.yml @@ -6,7 +6,8 @@ Collections: - Cityscapes Paper: URL: https://arxiv.org/abs/2105.15203 - Title: resize image to multiple of 32, improve SegFormer by 0.5-1.0 mIoU. + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' README: configs/segformer/README.md Code: URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 diff --git a/configs/sem_fpn/README.md b/configs/sem_fpn/README.md index e7e87132a..a3732fdfd 100644 --- a/configs/sem_fpn/README.md +++ b/configs/sem_fpn/README.md @@ -1,4 +1,6 @@ -# Panoptic Feature Pyramid Networks +# Semantic FPN + +[Panoptic Feature Pyramid Networks](https://arxiv.org/abs/1901.02446) ## Introduction @@ -19,25 +21,18 @@ The recently introduced panoptic segmentation task has renewed our community's i -
-Semantic FPN (CVPR'2019) +## Citation -```latex -@article{Kirillov_2019, - title={Panoptic Feature Pyramid Networks}, - ISBN={9781728132938}, - url={http://dx.doi.org/10.1109/CVPR.2019.00656}, - DOI={10.1109/cvpr.2019.00656}, - journal={2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, - publisher={IEEE}, - author={Kirillov, Alexander and Girshick, Ross and He, Kaiming and Dollar, Piotr}, - year={2019}, - month={Jun} +```bibtex +@inproceedings{kirillov2019panoptic, + title={Panoptic feature pyramid networks}, + author={Kirillov, Alexander and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={6399--6408}, + year={2019} } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/setr/README.md b/configs/setr/README.md index d1eb0260f..5673d9b63 100644 --- a/configs/setr/README.md +++ b/configs/setr/README.md @@ -1,4 +1,6 @@ -# Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers +# SETR + +[Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers](https://arxiv.org/abs/2012.15840) ## Introduction @@ -23,10 +25,9 @@ Most recent semantic segmentation methods adopt a fully-convolutional network (F This head has two version head. ``` -
-SETR (CVPR'2021) +## Citation -```latex +```bibtex @article{zheng2020rethinking, title={Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers}, author={Zheng, Sixiao and Lu, Jiachen and Zhao, Hengshuang and Zhu, Xiatian and Luo, Zekun and Wang, Yabiao and Fu, Yanwei and Feng, Jianfeng and Xiang, Tao and Torr, Philip HS and others}, @@ -35,8 +36,6 @@ This head has two version head. } ``` -
- ## Results and models ### ADE20K diff --git a/configs/stdc/README.md b/configs/stdc/README.md index 11fe3d677..4fece6fe3 100644 --- a/configs/stdc/README.md +++ b/configs/stdc/README.md @@ -1,4 +1,6 @@ -# Rethinking BiSeNet For Real-time Semantic Segmentation +# STDC + +[Rethinking BiSeNet For Real-time Semantic Segmentation](https://arxiv.org/abs/2104.13188) ## Introduction @@ -19,10 +21,9 @@ BiSeNet has been proved to be a popular two-stream network for real-time segment -
-STDC (CVPR'2021) +## Citation -```latex +```bibtex @inproceedings{fan2021rethinking, title={Rethinking BiSeNet For Real-time Semantic Segmentation}, author={Fan, Mingyuan and Lai, Shenqi and Huang, Junshi and Wei, Xiaoming and Chai, Zhenhua and Luo, Junfeng and Wei, Xiaolin}, @@ -32,8 +33,6 @@ BiSeNet has been proved to be a popular two-stream network for real-time segment } ``` -
- ## Usage To use original repositories' [ImageNet Pretrained STDCNet Weights](https://drive.google.com/drive/folders/1wROFwRt8qWHD4jSo8Zu1gp1d6oYJ3ns1) , it is necessary to convert keys. diff --git a/configs/swin/README.md b/configs/swin/README.md index 422133153..ac4bd56e7 100644 --- a/configs/swin/README.md +++ b/configs/swin/README.md @@ -1,8 +1,10 @@ -# Swin Transformer: Hierarchical Vision Transformer using Shifted Windows +# Swin Transformer + +[Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) ## Introduction - + Official Repo @@ -19,10 +21,9 @@ This paper presents a new vision Transformer, called Swin Transformer, that capa -
-Swin Transformer (arXiv'2021) +## Citation -```latex +```bibtex @article{liu2021Swin, title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows}, author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, @@ -31,8 +32,6 @@ This paper presents a new vision Transformer, called Swin Transformer, that capa } ``` -
- ## Usage To use other repositories' pre-trained models, it is necessary to convert keys. diff --git a/configs/twins/README.md b/configs/twins/README.md index 0dbb41e6f..0ecb79d01 100644 --- a/configs/twins/README.md +++ b/configs/twins/README.md @@ -1,8 +1,10 @@ -# Twins: Revisiting the Design of Spatial Attention in Vision Transformers +# Twins + +[Twins: Revisiting the Design of Spatial Attention in Vision Transformers](https://arxiv.org/pdf/2104.13840.pdf) ## Introduction - + Official Repo @@ -19,10 +21,9 @@ Very recently, a variety of vision transformer architectures for dense predictio -
- Twins (NeurIPS'2021) +## Citation -```latex +```bibtex @article{chu2021twins, title={Twins: Revisiting spatial attention design in vision transformers}, author={Chu, Xiangxiang and Tian, Zhi and Wang, Yuqing and Zhang, Bo and Ren, Haibing and Wei, Xiaolin and Xia, Huaxia and Shen, Chunhua}, @@ -31,8 +32,6 @@ Very recently, a variety of vision transformer architectures for dense predictio } ``` -
- ## Usage To use other repositories' pre-trained models, it is necessary to convert keys. diff --git a/configs/unet/README.md b/configs/unet/README.md index 727ecf5e3..96bc98d7a 100644 --- a/configs/unet/README.md +++ b/configs/unet/README.md @@ -1,4 +1,6 @@ -# U-Net: Convolutional Networks for Biomedical Image Segmentation +# UNet + +[U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597) ## Introduction @@ -19,10 +21,9 @@ There is large consent that successful training of deep networks requires many t -
-UNet (MICCAI'2016/Nat. Methods'2019) +## Citation -```latex +```bibtex @inproceedings{ronneberger2015u, title={U-net: Convolutional networks for biomedical image segmentation}, author={Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas}, @@ -33,8 +34,6 @@ There is large consent that successful training of deep networks requires many t } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/upernet/README.md b/configs/upernet/README.md index 50e788e12..0ab3cb3d3 100644 --- a/configs/upernet/README.md +++ b/configs/upernet/README.md @@ -1,4 +1,6 @@ -# Unified Perceptual Parsing for Scene Understanding +# UPerNet + +[Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/pdf/1807.10221.pdf) ## Introduction @@ -19,10 +21,9 @@ Humans recognize the visual world at multiple levels: we effortlessly categorize -
-UPerNet (ECCV'2018) +## Citation -```latex +```bibtex @inproceedings{xiao2018unified, title={Unified perceptual parsing for scene understanding}, author={Xiao, Tete and Liu, Yingcheng and Zhou, Bolei and Jiang, Yuning and Sun, Jian}, @@ -32,8 +33,6 @@ Humans recognize the visual world at multiple levels: we effortlessly categorize } ``` -
- ## Results and models ### Cityscapes diff --git a/configs/vit/README.md b/configs/vit/README.md index ac06f5169..eec65b52e 100644 --- a/configs/vit/README.md +++ b/configs/vit/README.md @@ -1,8 +1,10 @@ # Vision Transformer +[An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/pdf/2010.11929.pdf) + ## Introduction - + Official Repo @@ -19,10 +21,9 @@ While the Transformer architecture has become the de-facto standard for natural -
-Vision Transformer (ICLR'2021) +## Citation -```latex +```bibtex @article{dosoViTskiy2020, title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, author={DosoViTskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, @@ -31,8 +32,6 @@ While the Transformer architecture has become the de-facto standard for natural } ``` -
- ## Usage To use other repositories' pre-trained models, it is necessary to convert keys. diff --git a/configs/vit/vit.yml b/configs/vit/vit.yml index 9d6449b0a..9e3b02e5a 100644 --- a/configs/vit/vit.yml +++ b/configs/vit/vit.yml @@ -5,7 +5,7 @@ Collections: - ADE20K Paper: URL: https://arxiv.org/pdf/2010.11929.pdf - Title: Vision Transformer + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' README: configs/vit/README.md Code: URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98