diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c55af0f2..1f1e9ad5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,9 +29,9 @@ repos:
     rev: 0.7.9
     hooks:
       - id: mdformat
-        args: ["--number", "--table-width", "200"]
+        args: ["--number", "--table-width", "200", "--disable-escape", "backslash"]
         additional_dependencies:
-          - mdformat-openmmlab
+          - "mdformat-openmmlab>=0.0.4"
           - mdformat_frontmatter
           - linkify-it-py
   - repo: https://github.com/codespell-project/codespell
diff --git a/configs/resnet/README.md b/configs/resnet/README.md
index 69f128df..266a71e8 100644
--- a/configs/resnet/README.md
+++ b/configs/resnet/README.md
@@ -4,16 +4,94 @@
-## Abstract
+## Introduction
 
-Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers.
+**Residual Networks**, or **ResNets**, learn residual functions with reference to the layer inputs, instead
+of learning unreferenced functions. In previous mainstream works such as VGG, the neural network is a plain
+stack of layers, and every layer attempts to fit a desired underlying mapping directly. In ResNets, a few
+stacked layers are grouped as a block, and the layers in a block attempt to learn a residual mapping.
 
-The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.
+Formally, denote the desired underlying mapping of a block as $\mathcal{H}(x)$, split it into the sum of the
+identity and a residual mapping, $\mathcal{H}(x) = x + \mathcal{F}(x)$, and let the stacked non-linear layers
+fit the residual mapping $\mathcal{F}(x)$.
+
+Many works have shown that this formulation makes deep neural networks easier to optimize and lets them gain
+accuracy from considerably increased depth. Nowadays, the residual structure is widely used in various models.
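+
+To make this concrete, the snippet below is a minimal, illustrative sketch of a residual block in PyTorch.
+It is not the implementation used in this repo (the layer choices and names here are arbitrary); it only
+shows the shortcut connection described above.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class ToyResBlock(nn.Module):
+    """A toy residual block computing relu(x + F(x))."""
+
+    def __init__(self, channels):
+        super().__init__()
+        # F(x): two stacked 3x3 convolutions fit the residual mapping.
+        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(channels)
+        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(channels)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        identity = x                              # the identity shortcut
+        out = self.relu(self.bn1(self.conv1(x)))  # first non-linear layer
+        out = self.bn2(self.conv2(out))           # second layer, no activation yet
+        return self.relu(out + identity)          # H(x) = x + F(x)
+
+
+block = ToyResBlock(64)
+print(block(torch.rand(1, 64, 56, 56)).shape)  # torch.Size([1, 64, 56, 56])
+```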
+
+## Abstract
+
+<details>
+
+<summary>Show the paper's abstract</summary>
+
+Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers.
+
+The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.
+
+</details>
+
+## How to use it?
+
+**Predict image**
+
+```python
+>>> import torch
+>>> from mmcls.apis import init_model, inference_model
+>>>
+>>> model = init_model('configs/resnet/resnet50_8xb32_in1k.py', 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.pth')
+>>> predict = inference_model(model, 'demo/demo.JPEG')
+>>> print(predict['pred_class'])
+sea snake
+>>> print(predict['pred_score'])
+0.6649363040924072
+```
+
+**Use the model**
+
+```python
+>>> import torch
+>>> from mmcls.apis import init_model
+>>>
+>>> model = init_model('configs/resnet/resnet50_8xb32_in1k.py', 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.pth')
+>>> inputs = torch.rand(1, 3, 224, 224).to(model.data_preprocessor.device)
+>>> # To get classification scores.
+>>> out = model(inputs)
+>>> print(out.shape)
+torch.Size([1, 1000])
+>>> # To extract features.
+>>> outs = model.extract_feat(inputs)
+>>> print(outs[0].shape)
+torch.Size([1, 2048])
+```
+
+**Train/Test Command**
+
+Place the ImageNet dataset in the `data/imagenet/` directory, or prepare datasets according to the [docs](https://mmclassification.readthedocs.io/en/1.x/user_guides/dataset_prepare.html#prepare-dataset).
+
+Train:
+
+```shell
+python tools/train.py configs/resnet/resnet50_8xb32_in1k.py
+```
+
+Test:
+
+```shell
+python tools/test.py configs/resnet/resnet50_8xb32_in1k.py https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.pth
+```
+
+For more configurable parameters, please refer to the [API](https://mmclassification.readthedocs.io/en/1.x/api/generated/mmcls.models.backbones.ResNet.html#mmcls.models.backbones.ResNet).
+
 ## Results and models
 
 The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don't have evaluation results.
diff --git a/configs/t2t_vit/README.md b/configs/t2t_vit/README.md
index ad108dc0..48e0f194 100644
--- a/configs/t2t_vit/README.md
+++ b/configs/t2t_vit/README.md
@@ -6,7 +6,7 @@
 
 ## Abstract
 
-Transformers, which are popular for language modeling, have been explored for solving vision tasks recently, \\eg, the Vision Transformer (ViT) for image classification. The ViT model splits each image into a sequence of tokens with fixed length and then applies multiple Transformer layers to model their global relation for classification. However, ViT achieves inferior performance to CNNs when trained from scratch on a midsize dataset like ImageNet. We find it is because: 1) the simple tokenization of input images fails to model the important local structure such as edges and lines among neighboring pixels, leading to low training sample efficiency; 2) the redundant attention backbone design of ViT leads to limited feature richness for fixed computation budgets and limited training samples. To overcome such limitations, we propose a new Tokens-To-Token Vision Transformer (T2T-ViT), which incorporates 1) a layer-wise Tokens-to-Token (T2T) transformation to progressively structurize the image to tokens by recursively aggregating neighboring Tokens into one Token (Tokens-to-Token), such that local structure represented by surrounding tokens can be modeled and tokens length can be reduced; 2) an efficient backbone with a deep-narrow structure for vision transformer motivated by CNN architecture design after empirical study. Notably, T2T-ViT reduces the parameter count and MACs of vanilla ViT by half, while achieving more than 3.0% improvement when trained from scratch on ImageNet. It also outperforms ResNets and achieves comparable performance with MobileNets by directly training on ImageNet. For example, T2T-ViT with comparable size to ResNet50 (21.5M parameters) can achieve 83.3% top1 accuracy in image resolution 384×384 on ImageNet.
+Transformers, which are popular for language modeling, have been explored for solving vision tasks recently, e.g., the Vision Transformer (ViT) for image classification. The ViT model splits each image into a sequence of tokens with fixed length and then applies multiple Transformer layers to model their global relation for classification. However, ViT achieves inferior performance to CNNs when trained from scratch on a midsize dataset like ImageNet. We find it is because: 1) the simple tokenization of input images fails to model the important local structure such as edges and lines among neighboring pixels, leading to low training sample efficiency; 2) the redundant attention backbone design of ViT leads to limited feature richness for fixed computation budgets and limited training samples. To overcome such limitations, we propose a new Tokens-To-Token Vision Transformer (T2T-ViT), which incorporates 1) a layer-wise Tokens-to-Token (T2T) transformation to progressively structurize the image to tokens by recursively aggregating neighboring Tokens into one Token (Tokens-to-Token), such that local structure represented by surrounding tokens can be modeled and tokens length can be reduced; 2) an efficient backbone with a deep-narrow structure for vision transformer motivated by CNN architecture design after empirical study. Notably, T2T-ViT reduces the parameter count and MACs of vanilla ViT by half, while achieving more than 3.0% improvement when trained from scratch on ImageNet. It also outperforms ResNets and achieves comparable performance with MobileNets by directly training on ImageNet. For example, T2T-ViT with comparable size to ResNet50 (21.5M parameters) can achieve 83.3% top1 accuracy in image resolution 384×384 on ImageNet.
diff --git a/docs/en/conf.py b/docs/en/conf.py
index 5cc79773..8117f69e 100644
--- a/docs/en/conf.py
+++ b/docs/en/conf.py
@@ -114,7 +114,10 @@ html_theme_options = {
     ],
     # Specify the language of shared menu
     'menu_lang':
-    'en'
+    'en',
+    # Disable the default edit on GitHub
+    'default_edit_on_github':
+    False,
 }
 
 # Add any paths that contain custom static files (such as style sheets) here,
@@ -217,7 +220,7 @@ copybutton_prompt_is_regexp = True
 # Auto-generated header anchors
 myst_heading_anchors = 3
 # Enable "colon_fence" extension of myst.
-myst_enable_extensions = ['colon_fence']
+myst_enable_extensions = ['colon_fence', 'dollarmath']
 
 # Configuration for intersphinx
 intersphinx_mapping = {
diff --git a/docs/en/stat.py b/docs/en/stat.py
index dcdbcb88..832008c3 100755
--- a/docs/en/stat.py
+++ b/docs/en/stat.py
@@ -97,6 +97,7 @@ def generate_paper_page(collection):
         return f'[{name}]({link})'
 
     content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_link, readme)
+    content = f'---\ngithub_page: /{collection.readme}\n---\n' + content
 
     def make_tabs(matchobj):
         """modify the format from emphasis black symbol to tabs."""
diff --git a/docs/zh_CN/conf.py b/docs/zh_CN/conf.py
index aecdde5c..76c13565 100644
--- a/docs/zh_CN/conf.py
+++ b/docs/zh_CN/conf.py
@@ -115,6 +115,9 @@ html_theme_options = {
     # Specify the language of shared menu
     'menu_lang':
     'cn',
+    # Disable the default edit on GitHub
+    'default_edit_on_github':
+    False,
 }
 
 # Add any paths that contain custom static files (such as style sheets) here,
@@ -204,7 +207,7 @@ copybutton_prompt_is_regexp = True
 # Auto-generated header anchors
 myst_heading_anchors = 3
 # Enable "colon_fence" extension of myst.
-myst_enable_extensions = ['colon_fence']
+myst_enable_extensions = ['colon_fence', 'dollarmath']
 
 # Configuration for intersphinx
 intersphinx_mapping = {
diff --git a/docs/zh_CN/stat.py b/docs/zh_CN/stat.py
index bd6d8b4c..8ffb9bfe 100755
--- a/docs/zh_CN/stat.py
+++ b/docs/zh_CN/stat.py
@@ -97,6 +97,7 @@ def generate_paper_page(collection):
         return f'[{name}]({link})'
 
     content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_link, readme)
+    content = f'---\ngithub_page: /{collection.readme}\n---\n' + content
 
     def make_tabs(matchobj):
         """modify the format from emphasis black symbol to tabs."""
diff --git a/mmcls/apis/inference.py b/mmcls/apis/inference.py
index 60fda25e..2ef0fa56 100644
--- a/mmcls/apis/inference.py
+++ b/mmcls/apis/inference.py
@@ -67,6 +67,7 @@ def inference_model(model, img):
         result (dict): The classification results that contains
             `class_name`, `pred_label` and `pred_score`.
""" + register_all_modules() cfg = model.cfg # build the data pipeline test_pipeline_cfg = cfg.test_dataloader.dataset.pipeline diff --git a/requirements/optional.txt b/requirements/optional.txt index 9eac3252..9657bfe1 100644 --- a/requirements/optional.txt +++ b/requirements/optional.txt @@ -1,4 +1,3 @@ albumentations>=0.3.2 --no-binary qudida,albumentations colorama -fvcore requests diff --git a/tools/analysis_tools/get_flops.py b/tools/analysis_tools/get_flops.py index 5e41ed9b..2c50d18f 100644 --- a/tools/analysis_tools/get_flops.py +++ b/tools/analysis_tools/get_flops.py @@ -8,8 +8,7 @@ try: flop_count_str, flop_count_table, parameter_count) except ImportError: print('You may need to install fvcore for flops computation, ' - 'and you can use `pip install -r requirements/optional.txt` ' - 'to set up the environment') + 'and you can use `pip install fvcore` to set up the environment') from fvcore.nn.print_model_statistics import _format_size from mmengine import Config