Add pp formulanet (#14429)

* add ppformulanet * rename loss * modify doc * add export code * modify yaml for global ref
2024-12-23 13:14:33 +08:00 · 2024-12-23 13:14:33 +08:00 · d523388ed1
parent 0697d248f8
commit d523388ed1
21 changed files with 3863 additions and 37 deletions
--- a/configs/rec/PP-FormuaNet/rec_pp_formulanet_l.yml
+++ b/configs/rec/PP-FormuaNet/rec_pp_formulanet_l.yml
@ -0,0 +1,117 @@
 # Run-wide settings for PP-FormulaNet-L. Values tagged with an &anchor are
 # reused by later sections through the matching *alias.
 Global:
  use_gpu: True
  epoch_num: 10
  log_smooth_window: 10
  print_batch_step: 10
  save_model_dir: ./output/rec/pp_formulanet_l/
  save_epoch_step: 2
  # Evaluation runs every 417 iterations, which the authors equate to one
  # epoch at a total batch size of 24 (max_seq_len: 1024). Re-derive this
  # value when the batch size or the dataset changes.
  eval_batch_step: [0, 417]
  cal_metric_during_train: True
  # Left empty (null) here; the docs supply Global.pretrained_model with -o.
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/datasets/pme_demo/0000013.png
  infer_mode: False
  use_space_char: False
  # Anchors shared with Architecture, PostProcess, Train and Eval.
  rec_char_dict_path: &rec_char_dict_path ppocr/utils/dict/unimernet_tokenizer
  max_new_tokens: &max_new_tokens 1024
  input_size: &input_size [768, 768]
  # NOTE(review): the file name still says "unimernet" - confirm it is intended.
  save_res_path: ./output/rec/predicts_unimernet_latexocr.txt
  allow_resize_largeImg: False
  start_ema: True
 # AdamW optimizer; the learning rate is driven by the LinearWarmupCosine
 # scheduler configured under `lr`.
 Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  weight_decay: 0.05
  lr:
    name: LinearWarmupCosine
    # Value handed to the scheduler as `learning_rate`.
    learning_rate: 0.0001
 # Model definition: Vary_VIT_B_Formula backbone feeding PPFormulaNet_Head.
 Architecture:
  model_type: rec
  algorithm: PP-FormulaNet-L
  in_channels: 3
  Transform:
  Backbone:
    # Keys below map onto the Vary_VIT_B_Formula constructor arguments.
    name: Vary_VIT_B_Formula
    # Same edge length as Global.input_size ([768, 768]).
    image_size: 768
    encoder_embed_dim: 768
    encoder_depth: 12
    encoder_num_heads: 12
    encoder_global_attn_indexes: [2, 5, 8, 11]
  Head:
    name: PPFormulaNet_Head
    # Alias of Global.max_new_tokens (1024).
    max_new_tokens: *max_new_tokens
    decoder_start_token_id: 0
    decoder_ffn_dim: 2048
    decoder_hidden_size: 512
    decoder_layers: 8
    temperature: 0.2
    do_sample: False
    top_p: 0.95
    # Equals the backbone's out_channels (1024).
    encoder_hidden_size: 1024
    is_export: False
    length_aware: False
    use_parallel: False
    parallel_step: 0
 # Cross-entropy loss class defined in ppocr/losses/rec_ppformulanet_loss.py.
 Loss:
  name: PPFormulaNet_L_Loss
 # Decoding step; uses the same tokenizer path as Global via the alias.
 PostProcess:
  name: UniMERNetDecode
  rec_char_dict_path: *rec_char_dict_path
 # Evaluation metric; exp_rate is the indicator used to rank checkpoints.
 Metric:
  name: LaTeXOCRMetric
  main_indicator: exp_rate
  # NOTE(review): key is spelled "blue" (presumably BLEU); it must stay as the
  # metric implementation expects it, so it is not renamed here.
  cal_blue_score: False
 # Training data pipeline; paths point at the extracted example dataset
 # (ocr_rec_latexocr_dataset_example.tar) in the working directory.
 Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./ocr_rec_latexocr_dataset_example
    label_file_list: ["./ocr_rec_latexocr_dataset_example/train.txt"]
    transforms:
      - UniMERNetImgDecode:
          input_size: *input_size
      - UniMERNetTrainTransform:
      - LatexImageFormat:
      - UniMERNetLabelEncode:
          rec_char_dict_path: *rec_char_dict_path
          max_seq_len: *max_new_tokens
      - KeepKeys:
          keep_keys: ['image', 'label', 'attention_mask']
  loader:
    # NOTE(review): shuffling is disabled for training - confirm intended.
    shuffle: False
    drop_last: False
    # 6 per card; the total of 24 quoted in Global presumably assumes 4 cards.
    batch_size_per_card: 6
    num_workers: 0
    collate_fn: UniMERNetCollator
 # Evaluation data pipeline; mirrors Train but uses the test-time transform
 # and additionally keeps 'filename' in each sample.
 Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./ocr_rec_latexocr_dataset_example
    label_file_list: ["./ocr_rec_latexocr_dataset_example/val.txt"]
    transforms:
      - UniMERNetImgDecode:
          input_size: *input_size
      - UniMERNetTestTransform:
      - LatexImageFormat:
      - UniMERNetLabelEncode:
          max_seq_len: *max_new_tokens
          rec_char_dict_path: *rec_char_dict_path
      - KeepKeys:
          keep_keys: ['image', 'label', 'attention_mask', 'filename']
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 10
    num_workers: 0
    collate_fn: UniMERNetCollator
--- a/configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml
+++ b/configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml
@ -0,0 +1,115 @@
 # Run-wide settings for PP-FormulaNet-S. Values tagged with an &anchor are
 # reused by later sections through the matching *alias.
 Global:
  use_gpu: True
  epoch_num: 20
  log_smooth_window: 10
  print_batch_step: 10
  save_model_dir: ./output/rec/pp_formulanet_s/
  save_epoch_step: 2
  # Evaluation runs every 179 iterations, which the authors equate to one
  # epoch at a total batch size of 56 (max_seq_len: 1024). Re-derive this
  # value when the batch size or the dataset changes.
  eval_batch_step: [0, 179]
  cal_metric_during_train: True
  # Left empty (null) here; the docs supply Global.pretrained_model with -o.
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/datasets/pme_demo/0000013.png
  infer_mode: False
  use_space_char: False
  # Anchors shared with Architecture, PostProcess, Train and Eval.
  rec_char_dict_path: &rec_char_dict_path ppocr/utils/dict/unimernet_tokenizer
  max_new_tokens: &max_new_tokens 1024
  input_size: &input_size [384, 384]
  # NOTE(review): the file name still says "unimernet" - confirm it is intended.
  save_res_path: ./output/rec/predicts_unimernet_latexocr.txt
  allow_resize_largeImg: False
  start_ema: True
 # AdamW optimizer; the learning rate is driven by the LinearWarmupCosine
 # scheduler configured under `lr`.
 Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  weight_decay: 0.05
  lr:
    name: LinearWarmupCosine
    # Value handed to the scheduler as `learning_rate`.
    learning_rate: 0.0001
 # Model definition: PPHGNetV2_B4 backbone feeding PPFormulaNet_Head.
 Architecture:
  model_type: rec
  algorithm: PP-FormulaNet-S
  in_channels: 3
  Transform:
  Backbone:
    name: PPHGNetV2_B4
    class_num: 1024
  Head:
    name: PPFormulaNet_Head
    # Alias of Global.max_new_tokens (1024).
    max_new_tokens: *max_new_tokens
    decoder_start_token_id: 0
    decoder_ffn_dim: 1536
    decoder_hidden_size: 384
    decoder_layers: 2
    temperature: 0.2
    do_sample: False
    top_p: 0.95
    encoder_hidden_size: 2048
    is_export: False
    length_aware: True
    # Must be a plain boolean: the former value `True,` (trailing comma) was
    # loaded as the string "True," instead of a bool.
    use_parallel: True
    # Same value as Loss.parallel_step; presumably the two must stay in sync.
    parallel_step: 3
 # Cross-entropy loss class defined in ppocr/losses/rec_ppformulanet_loss.py.
 Loss:
  name: PPFormulaNet_S_Loss
  # The loss drops this many leading label positions before comparing with
  # the logits; same value as Architecture.Head.parallel_step.
  parallel_step: 3
 # Decoding step; uses the same tokenizer path as Global via the alias.
 PostProcess:
  name: UniMERNetDecode
  rec_char_dict_path: *rec_char_dict_path
 # Evaluation metric; exp_rate is the indicator used to rank checkpoints.
 Metric:
  name: LaTeXOCRMetric
  main_indicator: exp_rate
  # NOTE(review): key is spelled "blue" (presumably BLEU); it must stay as the
  # metric implementation expects it, so it is not renamed here.
  cal_blue_score: False
 # Training data pipeline; paths point at the extracted example dataset
 # (ocr_rec_latexocr_dataset_example.tar) in the working directory.
 Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./ocr_rec_latexocr_dataset_example
    label_file_list: ["./ocr_rec_latexocr_dataset_example/train.txt"]
    transforms:
      - UniMERNetImgDecode:
          input_size: *input_size
      - UniMERNetTrainTransform:
      - LatexImageFormat:
      - UniMERNetLabelEncode:
          rec_char_dict_path: *rec_char_dict_path
          max_seq_len: *max_new_tokens
      - KeepKeys:
          keep_keys: ['image', 'label', 'attention_mask']
  loader:
    # NOTE(review): shuffling is disabled for training - confirm intended.
    shuffle: False
    drop_last: False
    # 14 per card; the total of 56 quoted in Global presumably assumes 4 cards.
    batch_size_per_card: 14
    num_workers: 0
    collate_fn: UniMERNetCollator
 # Evaluation data pipeline; mirrors Train but uses the test-time transform
 # and additionally keeps 'filename' in each sample.
 Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./ocr_rec_latexocr_dataset_example
    label_file_list: ["./ocr_rec_latexocr_dataset_example/val.txt"]
    transforms:
      - UniMERNetImgDecode:
          input_size: *input_size
      - UniMERNetTestTransform:
      - LatexImageFormat:
      - UniMERNetLabelEncode:
          max_seq_len: *max_new_tokens
          rec_char_dict_path: *rec_char_dict_path
      - KeepKeys:
          keep_keys: ['image', 'label', 'attention_mask', 'filename']
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 30
    num_workers: 0
    collate_fn: UniMERNetCollator
--- a/configs/rec/rec_unimernet.yml
+++ b/configs/rec/rec_unimernet.yml
@ -15,7 +15,9 @@ Global:
  infer_img: doc/datasets/pme_demo/0000013.png
  infer_mode: False
  use_space_char: False
-  rec_char_dict_path:  ppocr/utils/dict/unimernet_tokenizer
+  rec_char_dict_path: &rec_char_dict_path ppocr/utils/dict/unimernet_tokenizer
  input_size: &input_size [192, 672]
  max_seq_len: &max_seq_len 1024
  save_res_path: ./output/rec/predicts_unimernet_plus_config_latexocr.txt
  allow_resize_largeImg: False
@ -59,7 +61,7 @@ Loss:
 PostProcess:
  name:  UniMERNetDecode
-  rec_char_dict_path: ppocr/utils/dict/unimernet_tokenizer
+  rec_char_dict_path: *rec_char_dict_path
 Metric:
  name: LaTeXOCRMetric
@ -73,12 +75,12 @@ Train:
    label_file_list: ["./train_data/UniMERNet/train_unimernet_1M.txt"]
    transforms:
      - UniMERNetImgDecode:
-          input_size: [192, 672]
+          input_size: *input_size
      - UniMERNetTrainTransform: 
      - UniMERNetImageFormat:
      - UniMERNetLabelEncode:
-          rec_char_dict_path: ppocr/utils/dict/unimernet_tokenizer
+          rec_char_dict_path: *rec_char_dict_path
-          max_seq_len: 1024
+          max_seq_len: *max_seq_len
      - KeepKeys:
          keep_keys: ['image', 'label', 'attention_mask']
  loader:
@ -95,12 +97,12 @@ Eval:
    label_file_list: ["./train_data/UniMERNet/test_unimernet_cpe.txt"]
    transforms:
      - UniMERNetImgDecode:
-          input_size: [192, 672]
+          input_size: *input_size
      - UniMERNetTestTransform:
      - UniMERNetImageFormat:
      - UniMERNetLabelEncode:
-          max_seq_len: 1024
+          max_seq_len: *max_seq_len
-          rec_char_dict_path: ppocr/utils/dict/unimernet_tokenizer
+          rec_char_dict_path: *rec_char_dict_path
      - KeepKeys:
          keep_keys: ['image', 'label', 'attention_mask']
  loader:
--- a/docs/algorithm/formula_recognition/algorithm_rec_ppformulanet.md
+++ b/docs/algorithm/formula_recognition/algorithm_rec_ppformulanet.md
@ -0,0 +1,94 @@
 # 印刷数学公式识别算法-PP-FormulaNet
 ## 1. 算法简介
 `PP-FormulaNet` 是百度飞桨自研的公式识别模型，采用 PaddleX 内部自建的 5百万数据集进行训练，在对应测试集上的精度如下：
 | 模型        | 骨干网络       | 配置文件                                                  | SPE-<br/>BLEU↑ | CPE-<br/>BLEU↑  | Easy-<br/>BLEU↑ | Middle-<br/>BLEU↑ | Hard-<br/>BLEU↑| Avg-<br/>BLEU↑ | 下载链接 |
 |-----------|------------|------------------|:--------------:|:---------:|:----------:|:----------------:|:---------:|:-----------------:|:-----------------:|
 | UniMERNet | Donut Swin | [rec_unimernet.yml](../../../configs/rec/rec_unimernet.yml) |     0.9187  |    0.9252       | 0.8658  |    0.8228   | 0.7740 |     0.8613        |[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_unimernet_train.tar)|
 | PP-FormulaNet-S | PPHGNetV2_B4 | [rec_pp_formulanet_s.yml](../../../configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml) |    0.8694   |    0.8071       | 0.9294  |    0.9112    | 0.8391 |    0.8712       |[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_s_train.tar)|
 | PP-FormulaNet-L | Vary_VIT_B | [rec_pp_formulanet_l.yml](../../../configs/rec/PP-FormuaNet/rec_pp_formulanet_l.yml) |     0.9055   |     0.9206       | 0.9392  |     0.9273    | 0.9141 |     0.9213         |[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_l_train.tar)|
 其中，SPE、CPE为UniMERNet的简单公式数据集和复杂公式数据集；Easy、Middle、Hard为PaddleX内部自建的简单公式数据集（LaTeX 代码长度 0-64）、中等公式数据集（LaTeX 代码长度  64-256）和复杂公式数据集（LaTeX 代码长度  256+）。
 ## 2. 环境配置
 请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境，参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。
 此外，需要安装额外的依赖：
 ```shell
 sudo apt-get update
 sudo apt-get install libmagickwand-dev
 pip install -r docs/algorithm/formula_recognition/requirements.txt
 ```
 ## 3. 模型训练、评估、预测
 ### 3.1 准备数据集
 ```shell
 # 下载 PaddleX 官方示例数据集
 wget https://paddle-model-ecology.bj.bcebos.com/paddlex/data/ocr_rec_latexocr_dataset_example.tar
 tar -xf ocr_rec_latexocr_dataset_example.tar
 ```
 ### 3.2 下载预训练模型
 ```shell
 # 下载 PP-FormulaNet-S 预训练模型
 wget https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_s_train.tar 
 tar -xf rec_ppformulanet_s_train.tar
 ```
 ### 3.3 模型训练
 请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化，训练 `PP-FormulaNet-S` 识别模型时需要**更换配置文件**为 `PP-FormulaNet-S` 的[配置文件](https://github.com/PaddlePaddle/PaddleOCR/blob/main/configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml)。
 #### 启动训练
 具体地，在完成数据准备后，便可以启动训练，训练命令如下：
 ```shell
 #单卡训练 (默认训练方式)
 python3 tools/train.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
   -o Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
 #多卡训练，通过--gpus参数指定卡号
 python3 -m paddle.distributed.launch --gpus '0,1,2,3' --ips=127.0.0.1   tools/train.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
        -o Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
 ```
 **注意：**
 - 默认每训练 1个epoch（179 次iteration）进行1次评估，若您更改训练的batch_size，或更换数据集，请在训练时作出如下修改
 ```
 python3  -m paddle.distributed.launch --gpus '0,1,2,3' --ips=127.0.0.1   tools/train.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
  -o Global.eval_batch_step=[0,{length_of_dataset//batch_size//4}] \
   Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
 ```
 ### 3.4 评估
 可下载已训练完成的[模型文件](https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_s_train.tar)，使用如下命令进行评估：
 ```shell
 # 注意将pretrained_model的路径设置为本地路径。若使用自行训练保存的模型，请注意修改路径和文件名为{path/to/weights}/{model_name}。
 # demo 测试集评估
 python3 tools/eval.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml -o \
 Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
 ```
 ### 3.5 预测
 使用如下命令进行单张图片预测：
 ```shell
 # 注意将pretrained_model的路径设置为本地路径。
 python3 tools/infer_rec.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
  -o  Global.infer_img='./docs/datasets/images/pme_demo/0000099.png'\
   Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
 # 预测文件夹下所有图像时，可修改infer_img为文件夹，如 Global.infer_img='./docs/datasets/images/pme_demo/'。
 ```
 ## 4. FAQ
--- a/docs/algorithm/formula_recognition/algorithm_rec_ppformulanet_en.md
+++ b/docs/algorithm/formula_recognition/algorithm_rec_ppformulanet_en.md
@ -0,0 +1,78 @@
 # PP-FormulaNet
 ## 1. Introduction
 PP-FormulaNet is a formula recognition model independently developed by Baidu PaddlePaddle. It is trained on a self-built dataset of 5 million samples within PaddleX, achieving the following accuracy on the corresponding test set:
 | Model           | Backbone       | config                                                  |SPE-<br/>BLEU↑ | CPE-<br/>BLEU↑  | Easy-<br/>BLEU↑ | Middle-<br/>BLEU↑ | Hard-<br/>BLEU↑| Avg-<br/>BLEU↑  | Download link |
 |-----------|--------|---------------------------------------------------|:--------------:|:-----------------:|:----------:|:----------------:|:---------:|:-----------------:|:--------------:|
 | UniMERNet | Donut Swin | [rec_unimernet.yml](../../../configs/rec/rec_unimernet.yml) |     0.9187  |    0.9252       | 0.8658  |    0.8228   | 0.7740 |     0.8613        |[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_unimernet_train.tar)|
 | PP-FormulaNet-S | PPHGNetV2_B4 | [rec_pp_formulanet_s.yml](../../../configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml) |    0.8694   |    0.8071       | 0.9294  |    0.9112    | 0.8391 |    0.8712       |[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_s_train.tar)|
 | PP-FormulaNet-L | Vary_VIT_B | [rec_pp_formulanet_l.yml](../../../configs/rec/PP-FormuaNet/rec_pp_formulanet_l.yml) |     0.9055   |     0.9206       | 0.9392  |     0.9273    | 0.9141 |     0.9213         |[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_l_train.tar)|
 Among them, SPE and CPE refer to the simple and complex formula datasets of UniMERNet, respectively. Easy, Middle, and Hard are simple (LaTeX code length 0-64), medium (LaTeX code length 64-256), and complex formula datasets (LaTeX code length 256+) built internally by PaddleX.
 ## 2. Environment
 Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md) to clone the project code.
 Furthermore, additional dependencies need to be installed:
 ```shell
 sudo apt-get update
 sudo apt-get install libmagickwand-dev
 pip install -r docs/algorithm/formula_recognition/requirements.txt
 ```
 ## 3. Model Training / Evaluation / Prediction
 Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
 Dataset Preparation:
 ```shell
 # download PaddleX official example dataset
 wget https://paddle-model-ecology.bj.bcebos.com/paddlex/data/ocr_rec_latexocr_dataset_example.tar
 tar -xf ocr_rec_latexocr_dataset_example.tar
 ```
 Download the Pre-trained Model:
 ```shell
 # download the PP-FormulaNet-S pretrained model
 wget https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_s_train.tar 
 tar -xf rec_ppformulanet_s_train.tar
 ```
 Training:
 Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
 ```shell
 #Single GPU training 
 python3 tools/train.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
   -o Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
 #Multi GPU training, specify the gpu number through the --gpus parameter
 python3 -m paddle.distributed.launch --gpus '0,1,2,3' --ips=127.0.0.1   tools/train.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
        -o Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
 ```
 Evaluation:
 ```shell
 # GPU evaluation
 python3 tools/eval.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml -o \
 Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
 ```
 Prediction:
 ```shell
 # The configuration file used for prediction must match the one used for training.
 python3 tools/infer_rec.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
  -o  Global.infer_img='./docs/datasets/images/pme_demo/0000099.png'\
   Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
 ```
 ## 4. FAQ
--- a/docs/algorithm/formula_recognition/algorithm_rec_unimernet.md
+++ b/docs/algorithm/formula_recognition/algorithm_rec_unimernet.md
@ -20,7 +20,6 @@
 此外，需要安装额外的依赖：
 ```shell
 apt-get install sudo
 sudo apt-get update
 sudo apt-get install libmagickwand-dev
 pip install -r docs/algorithm/formula_recognition/requirements.txt
@ -107,7 +106,7 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' --ips=127.0.0.1   tools/tr
 ```
 python3  -m paddle.distributed.launch --gpus '0,1,2,3' --ips=127.0.0.1   tools/train.py -c configs/rec/rec_unimernet.yml \
  -o Global.eval_batch_step=[0,{length_of_dataset//batch_size//4}] \
-  -o Global.pretrained_model=./pretrain_models/texify.pdparams
+   Global.pretrained_model=./pretrain_models/texify.pdparams
 ```
 ### 3.4 评估
--- a/docs/algorithm/formula_recognition/algorithm_rec_unimernet_en.md
+++ b/docs/algorithm/formula_recognition/algorithm_rec_unimernet_en.md
@ -21,7 +21,6 @@ Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to co
 Furthermore, additional dependencies need to be installed:
 ```shell
 apt-get install sudo
 sudo apt-get update
 sudo apt-get install libmagickwand-dev
 pip install -r docs/algorithm/formula_recognition/requirements.txt
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -372,6 +372,8 @@ nav:
    - 公式识别算法:
      - CAN: algorithm/formula_recognition/algorithm_rec_can.md
      - LaTeX-OCR: algorithm/formula_recognition/algorithm_rec_latex_ocr.md
      - UniMERNet: algorithm/formula_recognition/algorithm_rec_unimernet.md
      - PP-FormulaNet: algorithm/formula_recognition/algorithm_rec_ppformulanet.md
    - 端到端OCR算法:
      - PGNet: algorithm/end_to_end/algorithm_e2e_pgnet.md
    - 表格识别算法:
--- a/ppocr/losses/init.py
+++ b/ppocr/losses/init.py
@ -47,6 +47,7 @@ from .rec_parseq_loss import ParseQLoss
 from .rec_cppd_loss import CPPDLoss
 from .rec_latexocr_loss import LaTeXOCRLoss
 from .rec_unimernet_loss import UniMERNetLoss
 from .rec_ppformulanet_loss import PPFormulaNet_S_Loss, PPFormulaNet_L_Loss
 # cls loss
 from .cls_loss import ClsLoss
@ -111,6 +112,8 @@ def build_loss(config):
        "CPPDLoss",
        "LaTeXOCRLoss",
        "UniMERNetLoss",
        "PPFormulaNet_S_Loss",
        "PPFormulaNet_L_Loss",
    ]
    config = copy.deepcopy(config)
    module_name = config.pop("name")
--- a/ppocr/losses/rec_ppformulanet_loss.py
+++ b/ppocr/losses/rec_ppformulanet_loss.py
@ -0,0 +1,74 @@
 # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import paddle
 import paddle.nn as nn
 class PPFormulaNet_S_Loss(nn.Layer):
    """
    Cross-entropy training loss for PP-FormulaNet-S.
    Label positions equal to ``ignore_index`` (-100) are excluded by the
    underlying ``nn.CrossEntropyLoss``; the loss is averaged (``reduction="mean"``).
    """
    def __init__(self, vocab_size=50000, parallel_step=1):
        """
        Args:
            vocab_size (int): Stored on the instance; not read by ``forward``.
            parallel_step (int): Number of leading label positions dropped
                before the labels are compared with the logits.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.parallel_step = int(parallel_step)
        self.pad_token_id = 1
        self.ignore_index = -100
        # Targets equal to ignore_index are skipped by the criterion.
        self.cross = nn.CrossEntropyLoss(
            ignore_index=self.ignore_index, reduction="mean"
        )
    def forward(self, preds, batch):
        """
        Args:
            preds: A ``(logits, masked_label)`` pair.
            batch: Unused; kept for the common loss interface.
        Returns:
            dict: ``{"loss": ..., "word_loss": ...}``; both keys hold the same value.
        """
        logits, masked_label = preds
        # Collapse every leading axis so each row is one prediction.
        num_classes = logits.shape[-1]
        flat_logits = paddle.reshape(logits, [-1, num_classes])
        # Skip the first `parallel_step` label positions, then flatten.
        kept_labels = masked_label[:, self.parallel_step :]
        flat_labels = paddle.reshape(kept_labels, [-1])
        word_loss = self.cross(flat_logits, flat_labels)
        return {"loss": word_loss, "word_loss": word_loss}
 class PPFormulaNet_L_Loss(nn.Layer):
    """
    Cross-entropy training loss for PP-FormulaNet-L.
    Label positions equal to ``ignore_index`` (-100) are excluded by the
    underlying ``nn.CrossEntropyLoss``; the loss is averaged (``reduction="mean"``).
    """
    def __init__(self, vocab_size=50000):
        """
        Args:
            vocab_size (int): Stored on the instance; not read by ``forward``.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.pad_token_id = 1
        self.ignore_index = -100
        # Targets equal to ignore_index are skipped by the criterion.
        self.cross = nn.CrossEntropyLoss(
            ignore_index=self.ignore_index, reduction="mean"
        )
    def forward(self, preds, batch):
        """
        Args:
            preds: A ``(logits, masked_label)`` pair.
            batch: Unused; kept for the common loss interface.
        Returns:
            dict: ``{"loss": ..., "word_loss": ...}``; both keys hold the same value.
        """
        logits, masked_label = preds
        # Collapse every leading axis so each row is one prediction.
        num_classes = logits.shape[-1]
        flat_logits = paddle.reshape(logits, [-1, num_classes])
        # Drop the first label position, then flatten.
        shifted_labels = masked_label[:, 1:]
        flat_labels = paddle.reshape(shifted_labels, [-1])
        word_loss = self.cross(flat_logits, flat_labels)
        return {"loss": word_loss, "word_loss": word_loss}
--- a/ppocr/modeling/backbones/init.py
+++ b/ppocr/modeling/backbones/init.py
@ -70,7 +70,8 @@ def build_backbone(config, model_type):
        from .rec_vit_parseq import ViTParseQ
        from .rec_repvit import RepSVTR
        from .rec_svtrv2 import SVTRv2
-        from .rec_vary_vit import Vary_VIT_B
+        from .rec_vary_vit import Vary_VIT_B, Vary_VIT_B_Formula
        from .rec_pphgnetv2 import PPHGNetV2_B4
        support_dict = [
            "MobileNetV1Enhance",
@ -99,6 +100,8 @@ def build_backbone(config, model_type):
            "HybridTransformer",
            "DonutSwinModel",
            "Vary_VIT_B",
            "PPHGNetV2_B4",
            "Vary_VIT_B_Formula",
        ]
    elif model_type == "e2e":
        from .e2e_resnet_vd_pg import ResNet
--- a/ppocr/modeling/backbones/rec_pphgnetv2.py
+++ b/ppocr/modeling/backbones/rec_pphgnetv2.py
--- a/ppocr/modeling/backbones/rec_vary_vit.py
+++ b/ppocr/modeling/backbones/rec_vary_vit.py
@ -27,6 +27,7 @@ from paddle.nn.initializer import (
    TruncatedNormal,
    XavierUniform,
 )
 from ppocr.modeling.backbones.rec_donut_swin import DonutSwinModelOutput
 zeros_ = Constant(value=0.0)
 ones_ = Constant(value=1.0)
@ -90,6 +91,7 @@ class ImageEncoderViT(nn.Layer):
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        global_attn_indexes: Tuple[int, ...] = (),
        is_formula: bool = False,
    ) -> None:
        """
        Args:
@ -168,6 +170,7 @@ class ImageEncoderViT(nn.Layer):
        self.net_3 = nn.Conv2D(
            512, 1024, kernel_size=3, stride=2, padding=1, bias_attr=False
        )
        self.is_formula = is_formula
    def forward(self, x):
        x = self.patch_embed(x)
@ -177,6 +180,8 @@ class ImageEncoderViT(nn.Layer):
            x = blk(x)
        x = self.neck(x.transpose([0, 3, 1, 2]))
        x = self.net_2(x)
        if self.is_formula:
            x = self.net_3(x)
        return x
@ -492,6 +497,7 @@ def _build_vary(
    encoder_num_heads,
    encoder_global_attn_indexes,
    image_size,
    is_formula=False,
 ):
    prompt_embed_dim = 256
    vit_patch_size = 16
@ -509,6 +515,7 @@ def _build_vary(
        global_attn_indexes=encoder_global_attn_indexes,
        window_size=14,
        out_chans=prompt_embed_dim,
        is_formula=is_formula,
    )
    return image_encoder
@ -543,3 +550,67 @@ class Vary_VIT_B(nn.Layer):
        cnn_feature = self.vision_tower_high(pixel_values)
        cnn_feature = cnn_feature.flatten(2).transpose([0, 2, 1])
        return cnn_feature
 class Vary_VIT_B_Formula(nn.Layer):
    def __init__(
        self,
        in_channels=3,
        image_size=768,
        encoder_embed_dim=768,
        encoder_depth=12,
        encoder_num_heads=12,
        encoder_global_attn_indexes=[2, 5, 8, 11],
    ):
        """
        Vary_VIT_B_Formula
        Builds the Vary image encoder with ``is_formula=True`` and a 1024 -> 1024
        linear projector on top of it.
        Args:
            in_channels (int): Accepted for config compatibility but not read here;
                single-channel inputs are expanded to 3 channels in ``forward``. Default is 3.
            image_size (int): Size of the input image. Default is 768.
            encoder_embed_dim (int): Dimension of the encoder's embedding. Default is 768.
            encoder_depth (int): Number of layers (depth) in the encoder. Default is 12.
            encoder_num_heads (int): Number of attention heads in the encoder. Default is 12.
            encoder_global_attn_indexes (list): List of indices specifying which encoder layers use global attention.
                ``None`` falls back to the default. Default is [2, 5, 8, 11].
        Returns:
            model: nn.Layer. Specific `Vary_VIT_B_Formula` model with defined architecture.
        """
        super(Vary_VIT_B_Formula, self).__init__()
        if encoder_global_attn_indexes is None:
            encoder_global_attn_indexes = [2, 5, 8, 11]
        self.vision_tower_high = _build_vary(
            encoder_embed_dim=encoder_embed_dim,
            encoder_depth=encoder_depth,
            encoder_num_heads=encoder_num_heads,
            # Forward the configured indexes instead of a hard-coded literal, so the
            # argument actually takes effect. The copy keeps the shared default list
            # (and the caller's list) from being mutated downstream.
            encoder_global_attn_indexes=list(encoder_global_attn_indexes),
            image_size=image_size,
            is_formula=True,
        )
        # With is_formula=True the encoder ends in a conv that outputs 1024 channels.
        self.mm_projector_vary = nn.Linear(1024, 1024)
        self.out_channels = 1024
    def forward(self, input_data):
        """
        Encode images and wrap the features in a ``DonutSwinModelOutput``.
        Args:
            input_data: In training mode a ``(pixel_values, label, attention_mask)``
                triple. Otherwise either the image tensor itself or a list whose
                first element is the image tensor.
        Returns:
            In training mode ``(output, label, attention_mask)``; otherwise ``output``
            alone, where ``output.last_hidden_state`` holds the projected features.
        """
        if self.training:
            pixel_values, label, attention_mask = input_data
        else:
            if isinstance(input_data, list):
                pixel_values = input_data[0]
            else:
                pixel_values = input_data
        num_channels = pixel_values.shape[1]
        # Replicate a single-channel image to three channels.
        if num_channels == 1:
            pixel_values = paddle.repeat_interleave(pixel_values, repeats=3, axis=1)
        cnn_feature = self.vision_tower_high(pixel_values)
        # Flatten everything after the channel axis and move channels last.
        cnn_feature = cnn_feature.flatten(2).transpose([0, 2, 1])
        cnn_feature = self.mm_projector_vary(cnn_feature)
        donut_swin_output = DonutSwinModelOutput(
            last_hidden_state=cnn_feature,
            pooler_output=None,
            hidden_states=None,
            attentions=None,
            reshaped_hidden_states=None,
        )
        if self.training:
            return donut_swin_output, label, attention_mask
        else:
            return donut_swin_output
--- a/ppocr/modeling/heads/init.py
+++ b/ppocr/modeling/heads/init.py
@ -45,6 +45,7 @@ def build_head(config):
    from .rec_parseq_head import ParseQHead
    from .rec_cppd_head import CPPDHead
    from .rec_unimernet_head import UniMERNetHead
    from .rec_ppformulanet_head import PPFormulaNet_Head
    # cls head
    from .cls_head import ClsHead
@ -89,6 +90,7 @@ def build_head(config):
        "ParseQHead",
        "CPPDHead",
        "UniMERNetHead",
        "PPFormulaNet_Head",
    ]
    if config["name"] == "DRRGHead":
--- a/ppocr/modeling/heads/rec_ppformulanet_head.py
+++ b/ppocr/modeling/heads/rec_ppformulanet_head.py
--- a/ppocr/modeling/heads/rec_unimernet_head.py
+++ b/ppocr/modeling/heads/rec_unimernet_head.py
@ -217,6 +217,8 @@ class MBartConfig(object):
        forced_eos_token_id=2,
        _attn_implementation="eager",
        hidden_size=1024,
        use_parallel=False,
        parallel_step=2,
        is_export=False,
        **kwargs,
    ):
@ -251,6 +253,8 @@ class MBartConfig(object):
        self.is_encoder_decoder = is_encoder_decoder
        self.forced_eos_token_id = forced_eos_token_id
        self._attn_implementation = _attn_implementation
        self.use_parallel = use_parallel
        self.parallel_step = parallel_step
        self.is_export = is_export
        super().__init__()
@ -310,6 +314,22 @@ class AttentionMaskConverter:
            [bsz, 1, tgt_len, tgt_len + past_key_values_length]
        )
    def to_4d_export(
        self,
        attention_mask_2d,
        query_length,
        dtype,
        key_value_length,
        is_export=False,
    ):
        """
        Expand a 2-D attention mask for the export path.
        Delegates to ``self._expand_mask`` with ``tgt_len=query_length``; no
        causal mask is combined with the result here.
        Args:
            attention_mask_2d: The 2-D mask to expand.
            query_length: Target length passed on as ``tgt_len``.
            dtype: Data type passed on to ``_expand_mask``.
            key_value_length: Unused; kept for signature parity with ``to_4d``.
            is_export: Unused.
        Returns:
            Whatever ``_expand_mask`` returns for the given mask.
        """
        return self._expand_mask(attention_mask_2d, dtype, tgt_len=query_length)
    def to_4d(
        self,
        attention_mask_2d,
@ -321,7 +341,6 @@ class AttentionMaskConverter:
        input_shape = (attention_mask_2d.shape[0], query_length)
        causal_4d_mask = None
        if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
            if key_value_length is None:
                raise ValueError(
@ -375,6 +394,33 @@ def _prepare_4d_attention_mask(mask, dtype, tgt_len=None):
    return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
 def _prepare_4d_causal_attention_mask_export(
    attention_mask,
    input_shape,
    inputs_embeds,
    past_key_values_length,
    sliding_window=None,
    is_export=False,
 ):
    """
    Build the decoder attention mask used on the export path.
    Creates an ``AttentionMaskConverter`` and returns the result of its
    ``to_4d_export`` for ``attention_mask``.
    Args:
        attention_mask: 2-D attention mask to expand.
        input_shape: Shape of the decoder input; only its last entry
            (the query length) is read.
        inputs_embeds: Only its ``dtype`` is read.
        past_key_values_length: Added to the query length to form
            ``key_value_length``.
        sliding_window: Passed to the converter's constructor.
        is_export: Passed through to ``to_4d_export``.
    Returns:
        The expanded attention mask.
    """
    attn_mask_converter = AttentionMaskConverter(
        is_causal=True, sliding_window=sliding_window
    )
    query_length = input_shape[-1]
    key_value_length = query_length + past_key_values_length
    # The unused `shape` / `len_shape` locals of the earlier version were dropped.
    return attn_mask_converter.to_4d_export(
        attention_mask,
        query_length,
        key_value_length=key_value_length,
        dtype=inputs_embeds.dtype,
        is_export=is_export,
    )
 def _prepare_4d_causal_attention_mask(
    attention_mask,
    input_shape,
@ -1681,7 +1727,7 @@ class CustomMBartDecoder(MBartDecoder):
            )
        else:
            if self.is_export:
-                attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask = _prepare_4d_causal_attention_mask_export(
                    attention_mask,
                    input_shape,
                    inputs_embeds,
@ -1721,6 +1767,7 @@ class CustomMBartDecoder(MBartDecoder):
        hidden_states = nn.functional.dropout(
            hidden_states, p=self.dropout, training=self.training
        )
        if self.gradient_checkpointing and self.training:
            if use_cache:
                print(
@ -1828,7 +1875,6 @@ class CustomMBartDecoder(MBartDecoder):
                    ]
                    if v is not None
                )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
@ -2237,6 +2283,21 @@ class UniMERNetHead(nn.Layer):
        }
        return input_dict
    def prepare_inputs_for_generation_export(
        self,
        past_key_values=None,
        attention_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """
        Build the per-step model inputs for the export generation loop.
        Only ``use_cache`` is forwarded. ``past_key_values``, ``attention_mask``,
        ``encoder_outputs`` and any extra keyword arguments are accepted but ignored.
        Returns:
            dict: Exactly two entries, ``"decoder_attention_mask"`` (always
            ``None``) and ``"use_cache"``.
        NOTE(review): the result has no ``"past_key_values"`` entry - confirm
        that no caller indexes it.
        """
        return {"decoder_attention_mask": None, "use_cache": use_cache}
    def _extract_past_from_model_output(
        self, outputs: ModelOutput, standardize_cache_format: bool = False
    ):
@ -2434,9 +2495,10 @@ class UniMERNetHead(nn.Layer):
    @paddle.no_grad()
    def generate_export(
        self,
        encoder_outputs,
        model_kwargs,
    ):
-        batch_size = model_kwargs["encoder_outputs"]["last_hidden_state"].shape[0]
+        batch_size = encoder_outputs["last_hidden_state"].shape[0]
        generation_config = {
            "decoder_start_token_id": 0,
            "bos_token_id": 0,
@ -2447,26 +2509,33 @@ class UniMERNetHead(nn.Layer):
            decoder_start_token_id=generation_config["decoder_start_token_id"],
            bos_token_id=generation_config["bos_token_id"],
        )
        input_ids = input_ids.reshape([-1, 1])
        decoder_input_ids = input_ids
        model_kwargs["key use_cache"] = True
        batch_size, cur_len = input_ids.shape
        if "inputs_embeds" in model_kwargs:
            cur_len = model_kwargs["inputs_embeds"].shape[1]
-        model_kwargs["cache_position"] = paddle.arange(cur_len)
+        cache_position = paddle.arange(cur_len)
        pad_token_id = self.pad_token_id
        eos_token_id = [self.eos_token_id]
        eos_token = self.eos_token_id
        unfinished_sequences = paddle.ones([batch_size], dtype=paddle.int64)
        i_idx = paddle.full([], 0)
-
+        past_key_values = []
        for i in range(8):
            init_arr = paddle.zeros([batch_size, 16, 0, 64])
            paddle.jit.api.set_dynamic_shape(init_arr, [-1, -1, -1, -1])
            cache = (init_arr, init_arr, init_arr, init_arr)
            past_key_values.append(cache)
        idx = 0
        while i_idx < paddle.to_tensor(self.max_seq_len):
-            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+            model_inputs = self.prepare_inputs_for_generation_export(
-            decoder_input_ids = model_inputs["decoder_input_ids"]
+                past_key_values=past_key_values, **model_kwargs
            )
            decoder_attention_mask = model_inputs["decoder_attention_mask"]
-            encoder_outputs = model_inputs["encoder_outputs"]
+            decoder_attention_mask = paddle.ones(input_ids.shape)
            past_key_values = model_inputs["past_key_values"]
            paddle.jit.api.set_dynamic_shape(decoder_input_ids, [-1, -1])
            paddle.jit.api.set_dynamic_shape(decoder_attention_mask, [-1, -1])
@ -2489,6 +2558,10 @@ class UniMERNetHead(nn.Layer):
                    1 - unfinished_sequences
                )
            input_ids = paddle.concat([input_ids, next_tokens.unsqueeze(1)], axis=-1)
            past_length = past_key_values[0][0].shape[2]
            decoder_input_ids = next_tokens.unsqueeze(1)
            past_key_values = outputs.past_key_values
            cache_position = cache_position[-1:] + 1
            unfinished_sequences = unfinished_sequences & ~self.stopping_criteria(
                input_ids
            ).cast(paddle.int64)
@ -2500,6 +2573,7 @@ class UniMERNetHead(nn.Layer):
                ).all()
            ):
                break
            i_idx += 1
        return input_ids
@ -2578,15 +2652,20 @@ class UniMERNetHead(nn.Layer):
        """
        if not self.training:
            encoder_outputs = inputs
            if self.is_export:
                model_kwargs = {
                    "output_attentions": False,
                    "output_hidden_states": False,
                    "use_cache": True,
                }
                word_pred = self.generate_export(encoder_outputs, model_kwargs)
            else:
                model_kwargs = {
                    "output_attentions": False,
                    "output_hidden_states": False,
                    "use_cache": True,
                    "encoder_outputs": encoder_outputs,
                }
            if self.is_export:
                word_pred = self.generate_export(model_kwargs)
            else:
                word_pred = self.generate(model_kwargs)
            return word_pred
--- a/ppocr/utils/export_model.py
+++ b/ppocr/utils/export_model.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -63,6 +63,12 @@ def dump_infer_config(config, path, logger):
            common_dynamic_shapes = {
                "x": [[1, 3, 224, 224], [1, 3, 448, 448], [8, 3, 1280, 1280]]
            }
        elif arch_config["algorithm"] == "UniMERNet":
            common_dynamic_shapes = {"x": [[1, 3, 192, 672]]}
        elif arch_config["algorithm"] == "PP-FormulaNet-L":
            common_dynamic_shapes = {"x": [[1, 3, 768, 768]]}
        elif arch_config["algorithm"] == "PP-FormulaNet-S":
            common_dynamic_shapes = {"x": [[1, 3, 384, 384]]}
        else:
            common_dynamic_shapes = None
@ -91,6 +97,25 @@ def dump_infer_config(config, path, logger):
            with open(tokenizer_file, encoding="utf-8") as tokenizer_config_handle:
                character_dict = json.load(tokenizer_config_handle)
                postprocess["character_dict"] = character_dict
    elif config["Architecture"].get("algorithm") in [
        "UniMERNet",
        "PP-FormulaNet-L",
        "PP-FormulaNet-S",
    ]:
        tokenizer_file = config["Global"].get("rec_char_dict_path")
        fast_tokenizer_file = os.path.join(tokenizer_file, "tokenizer.json")
        tokenizer_config_file = os.path.join(tokenizer_file, "tokenizer_config.json")
        postprocess["character_dict"] = {}
        if fast_tokenizer_file is not None:
            with open(fast_tokenizer_file, encoding="utf-8") as tokenizer_config_handle:
                character_dict = json.load(tokenizer_config_handle)
                postprocess["character_dict"]["fast_tokenizer_file"] = character_dict
        if tokenizer_config_file is not None:
            with open(
                tokenizer_config_file, encoding="utf-8"
            ) as tokenizer_config_handle:
                character_dict = json.load(tokenizer_config_handle)
                postprocess["character_dict"]["tokenizer_config_file"] = character_dict
    else:
        if config["Global"].get("character_dict_path") is not None:
            with open(config["Global"]["character_dict_path"], encoding="utf-8") as f:
@ -208,6 +233,31 @@ def dynamic_to_static(model, arch_config, logger, input_shape=None):
            paddle.static.InputSpec(shape=[None, 1, None, None], dtype="float32"),
        ]
        model = to_static(model, input_spec=other_shape)
    elif arch_config["algorithm"] == "UniMERNet":
        model = paddle.jit.to_static(
            model,
            input_spec=[
                paddle.static.InputSpec(shape=[-1, 1, 192, 672], dtype="float32")
            ],
            full_graph=True,
        )
    elif arch_config["algorithm"] == "PP-FormulaNet-L":
        model = paddle.jit.to_static(
            model,
            input_spec=[
                paddle.static.InputSpec(shape=[-1, 1, 768, 768], dtype="float32")
            ],
            full_graph=True,
        )
    elif arch_config["algorithm"] == "PP-FormulaNet-S":
        model = paddle.jit.to_static(
            model,
            input_spec=[
                paddle.static.InputSpec(shape=[-1, 1, 384, 384], dtype="float32")
            ],
            full_graph=True,
        )
    elif arch_config["algorithm"] in ["LayoutLM", "LayoutLMv2", "LayoutXLM"]:
        input_spec = [
            paddle.static.InputSpec(shape=[None, 512], dtype="int64"),  # input_ids
@ -368,6 +418,14 @@ def export(config, base_model=None, save_path=None):
        config["Architecture"]["Backbone"]["is_predict"] = True
        config["Architecture"]["Backbone"]["is_export"] = True
        config["Architecture"]["Head"]["is_export"] = True
    if config["Architecture"].get("algorithm") in ["UniMERNet"]:
        config["Architecture"]["Backbone"]["is_export"] = True
        config["Architecture"]["Head"]["is_export"] = True
    if config["Architecture"].get("algorithm") in [
        "PP-FormulaNet-S",
        "PP-FormulaNet-L",
    ]:
        config["Architecture"]["Head"]["is_export"] = True
    if base_model is not None:
        model = base_model
        if isinstance(model, paddle.DataParallel):
--- a/tests/test_formula_model.py
+++ b/tests/test_formula_model.py
@ -9,7 +9,10 @@ import pytest
 current_dir = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.abspath(os.path.join(current_dir, "..")))
 from ppocr.modeling.backbones.rec_donut_swin import DonutSwinModel, DonutSwinModelOutput
 from ppocr.modeling.backbones.rec_pphgnetv2 import PPHGNetV2_B4
 from ppocr.modeling.backbones.rec_vary_vit import Vary_VIT_B_Formula
 from ppocr.modeling.heads.rec_unimernet_head import UniMERNetHead
 from ppocr.modeling.heads.rec_ppformulanet_head import PPFormulaNet_Head
@pytest.fixture
@ -17,6 +20,16 @@ def sample_image():
    return paddle.randn([1, 1, 192, 672])
@pytest.fixture
 def sample_image_ppformulanet_s():
    return paddle.randn([1, 1, 384, 384])
@pytest.fixture
 def sample_image_ppformulanet_l():
    return paddle.randn([1, 1, 768, 768])
@pytest.fixture
 def encoder_feat():
    encoded_feat = paddle.randn([1, 126, 1024])
@ -25,6 +38,22 @@ def encoder_feat():
    )
@pytest.fixture
 def encoder_feat_ppformulanet_s():
    encoded_feat = paddle.randn([1, 144, 2048])
    return DonutSwinModelOutput(
        last_hidden_state=encoded_feat,
    )
@pytest.fixture
 def encoder_feat_ppformulanet_l():
    encoded_feat = paddle.randn([1, 144, 1024])
    return DonutSwinModelOutput(
        last_hidden_state=encoded_feat,
    )
 def test_unimernet_backbone(sample_image):
    """
    Test UniMERNet backbone.
@ -68,3 +97,99 @@ def test_unimernet_head(encoder_feat):
    with paddle.no_grad():
        result = head(encoder_feat)
        assert result.shape == [1, 6]
 def test_ppformulanet_s_backbone(sample_image_ppformulanet_s):
    """
    Check the feature shape produced by the PP-FormulaNet-S backbone (PPHGNetV2_B4).
    Args:
        sample_image_ppformulanet_s: random input tensor supplied by the fixture.
    """
    model = PPHGNetV2_B4(class_num=1024)
    model.eval()
    with paddle.no_grad():
        outputs = model(sample_image_ppformulanet_s)
        features = outputs[0]
    expected_shape = [1, 144, 2048]
    assert features.shape == expected_shape
 def test_ppformulanet_s_head(encoder_feat_ppformulanet_s):
    """
    Run the PP-FormulaNet-S head in eval mode and check the prediction shape.
    Args:
        encoder_feat_ppformulanet_s: fixture output standing in for the backbone feature.
    """
    head_kwargs = dict(
        max_new_tokens=6,
        decoder_start_token_id=0,
        decoder_ffn_dim=1536,
        decoder_hidden_size=384,
        decoder_layers=2,
        temperature=0.2,
        do_sample=False,
        top_p=0.95,
        encoder_hidden_size=2048,
        is_export=False,
        length_aware=True,
        use_parallel=True,
        parallel_step=3,
    )
    decoder_head = PPFormulaNet_Head(**head_kwargs)
    decoder_head.eval()
    with paddle.no_grad():
        prediction = decoder_head(encoder_feat_ppformulanet_s)
    assert prediction.shape == [1, 9]
 def test_ppformulanet_l_backbone(sample_image_ppformulanet_l):
    """
    Check the feature shape produced by the PP-FormulaNet-L backbone (Vary_VIT_B_Formula).
    Args:
        sample_image_ppformulanet_l: random input tensor supplied by the fixture.
    """
    backbone_kwargs = dict(
        image_size=768,
        encoder_embed_dim=768,
        encoder_depth=12,
        encoder_num_heads=12,
        encoder_global_attn_indexes=[2, 5, 8, 11],
    )
    model = Vary_VIT_B_Formula(**backbone_kwargs)
    model.eval()
    with paddle.no_grad():
        outputs = model(sample_image_ppformulanet_l)
        features = outputs[0]
    expected_shape = [1, 144, 1024]
    assert features.shape == expected_shape
 def test_ppformulanet_l_head(encoder_feat_ppformulanet_l):
    """
    Run the PP-FormulaNet-L head in eval mode and check the prediction shape.
    Args:
        encoder_feat_ppformulanet_l: fixture output standing in for the backbone feature.
    """
    head_kwargs = dict(
        max_new_tokens=6,
        decoder_start_token_id=0,
        decoder_ffn_dim=2048,
        decoder_hidden_size=512,
        decoder_layers=8,
        temperature=0.2,
        do_sample=False,
        top_p=0.95,
        encoder_hidden_size=1024,
        is_export=False,
        length_aware=False,
        use_parallel=False,
        parallel_step=0,
    )
    decoder_head = PPFormulaNet_Head(**head_kwargs)
    decoder_head.eval()
    with paddle.no_grad():
        prediction = decoder_head(encoder_feat_ppformulanet_l)
    assert prediction.shape == [1, 7]
--- a/tools/eval.py
+++ b/tools/eval.py
@ -111,6 +111,12 @@ def main():
        elif config["Architecture"]["algorithm"] == "UniMERNet":
            model_type = "unimernet"
            config["Metric"]["cal_blue_score"] = True
        elif config["Architecture"]["algorithm"] in [
            "PP-FormulaNet-S",
            "PP-FormulaNet-L",
        ]:
            model_type = "pp_formulanet"
            config["Metric"]["cal_blue_score"] = True
        else:
            model_type = config["Architecture"]["model_type"]
    else:
--- a/tools/infer_rec.py
+++ b/tools/infer_rec.py
@ -134,7 +134,11 @@ def main():
            logger.info("infer_img: {}".format(file))
            with open(file, "rb") as f:
                img = f.read()
-                if config["Architecture"]["algorithm"] in ["UniMERNet"]:
+                if config["Architecture"]["algorithm"] in [
                    "UniMERNet",
                    "PP-FormulaNet-S",
                    "PP-FormulaNet-L",
                ]:
                    data = {"image": img, "filename": file}
                else:
                    data = {"image": img}
@ -192,7 +196,12 @@ def main():
            elif isinstance(post_result, list) and isinstance(post_result[0], int):
                # for RFLearning CNT branch
                info = str(post_result[0])
-            elif config["Architecture"]["algorithm"] in ["LaTeXOCR", "UniMERNet"]:
+            elif config["Architecture"]["algorithm"] in [
                "LaTeXOCR",
                "UniMERNet",
                "PP-FormulaNet-S",
                "PP-FormulaNet-L",
            ]:
                info = str(post_result[0])
            else:
                if len(post_result[0]) >= 2:
--- a/tools/program.py
+++ b/tools/program.py
@ -333,7 +333,12 @@ def train(
                        preds = model(batch)
                    elif algorithm in ["CAN"]:
                        preds = model(batch[:3])
-                    elif algorithm in ["LaTeXOCR", "UniMERNet"]:
+                    elif algorithm in [
                        "LaTeXOCR",
                        "UniMERNet",
                        "PP-FormulaNet-S",
                        "PP-FormulaNet-L",
                    ]:
                        preds = model(batch)
                    else:
                        preds = model(images)
@ -350,7 +355,12 @@ def train(
                    preds = model(batch)
                elif algorithm in ["CAN"]:
                    preds = model(batch[:3])
-                elif algorithm in ["LaTeXOCR", "UniMERNet"]:
+                elif algorithm in [
                    "LaTeXOCR",
                    "UniMERNet",
                    "PP-FormulaNet-S",
                    "PP-FormulaNet-L",
                ]:
                    preds = model(batch)
                else:
                    preds = model(images)
@ -381,6 +391,10 @@ def train(
                    model_type = "unimernet"
                    post_result = post_process_class(preds[0], batch[1], mode="train")
                    eval_class(post_result[0], post_result[1], epoch_reset=(idx == 0))
                elif algorithm in ["PP-FormulaNet-S", "PP-FormulaNet-L"]:
                    model_type = "pp_formulanet"
                    post_result = post_process_class(preds[0], batch[1], mode="train")
                    eval_class(post_result[0], post_result[1], epoch_reset=(idx == 0))
                else:
                    if config["Loss"]["name"] in [
                        "MultiLoss",
@ -677,7 +691,7 @@ def eval(
                    preds = model(batch)
                elif model_type in ["can"]:
                    preds = model(batch[:3])
-                elif model_type in ["latexocr", "unimernet"]:
+                elif model_type in ["latexocr", "unimernet", "pp_formulanet"]:
                    preds = model(batch)
                elif model_type in ["sr"]:
                    preds = model(batch)
@ -705,7 +719,7 @@ def eval(
                eval_class(preds, batch_numpy)
            elif model_type in ["can"]:
                eval_class(preds[0], batch_numpy[2:], epoch_reset=(idx == 0))
-            elif model_type in ["latexocr", "unimernet"]:
+            elif model_type in ["latexocr", "unimernet", "pp_formulanet"]:
                post_result = post_process_class(preds, batch[1], "eval")
                eval_class(post_result[0], post_result[1], epoch_reset=(idx == 0))
            else:
@ -855,6 +869,8 @@ def preprocess(is_train=False):
        "LaTeXOCR",
        "UniMERNet",
        "SLANeXt",
        "PP-FormulaNet-S",
        "PP-FormulaNet-L",
    ]
    if use_xpu: