Add pp formulanet (#14429)

* add ppformulanet

* rename loss

* modify doc

* add export code

* modify yaml for global ref
pull/14441/head
liuhongen1234567 2024-12-23 13:14:33 +08:00 committed by GitHub
parent 0697d248f8
commit d523388ed1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 3863 additions and 37 deletions

View File

@ -0,0 +1,117 @@
Global:
use_gpu: True
epoch_num: 10
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/rec/pp_formulanet_l/
save_epoch_step: 2
# evaluation is run every 417 iterations (1 epoch)(batch_size = 24) # max_seq_len: 1024
eval_batch_step: [0, 417 ]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/datasets/pme_demo/0000013.png
infer_mode: False
use_space_char: False
rec_char_dict_path: &rec_char_dict_path ppocr/utils/dict/unimernet_tokenizer
max_new_tokens: &max_new_tokens 1024
input_size: &input_size [768, 768]
save_res_path: ./output/rec/predicts_unimernet_latexocr.txt
allow_resize_largeImg: False
start_ema: True
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
weight_decay: 0.05
lr:
name: LinearWarmupCosine
learning_rate: 0.0001
Architecture:
model_type: rec
algorithm: PP-FormulaNet-L
in_channels: 3
Transform:
Backbone:
name: Vary_VIT_B_Formula
image_size: 768
encoder_embed_dim: 768
encoder_depth: 12
encoder_num_heads: 12
encoder_global_attn_indexes: [2, 5, 8, 11]
Head:
name: PPFormulaNet_Head
max_new_tokens: *max_new_tokens
decoder_start_token_id: 0
decoder_ffn_dim: 2048
decoder_hidden_size: 512
decoder_layers: 8
temperature: 0.2
do_sample: False
top_p: 0.95
encoder_hidden_size: 1024
is_export: False
length_aware: False
use_parallel: False
parallel_step: 0
Loss:
name: PPFormulaNet_L_Loss
PostProcess:
name: UniMERNetDecode
rec_char_dict_path: *rec_char_dict_path
Metric:
name: LaTeXOCRMetric
main_indicator: exp_rate
cal_blue_score: False
Train:
dataset:
name: SimpleDataSet
data_dir: ./ocr_rec_latexocr_dataset_example
label_file_list: ["./ocr_rec_latexocr_dataset_example/train.txt"]
transforms:
- UniMERNetImgDecode:
input_size: *input_size
- UniMERNetTrainTransform:
- LatexImageFormat:
- UniMERNetLabelEncode:
rec_char_dict_path: *rec_char_dict_path
max_seq_len: *max_new_tokens
- KeepKeys:
keep_keys: ['image', 'label', 'attention_mask']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 6
num_workers: 0
collate_fn: UniMERNetCollator
Eval:
dataset:
name: SimpleDataSet
data_dir: ./ocr_rec_latexocr_dataset_example
label_file_list: ["./ocr_rec_latexocr_dataset_example/val.txt"]
transforms:
- UniMERNetImgDecode:
input_size: *input_size
- UniMERNetTestTransform:
- LatexImageFormat:
- UniMERNetLabelEncode:
max_seq_len: *max_new_tokens
rec_char_dict_path: *rec_char_dict_path
- KeepKeys:
keep_keys: ['image', 'label', 'attention_mask', 'filename']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 10
num_workers: 0
collate_fn: UniMERNetCollator

View File

@ -0,0 +1,115 @@
Global:
use_gpu: True
epoch_num: 20
log_smooth_window: 10
print_batch_step: 10
save_model_dir: ./output/rec/pp_formulanet_s/
save_epoch_step: 2
# evaluation is run every 179 iterations (1 epoch)(batch_size = 56) # max_seq_len: 1024
eval_batch_step: [0, 179]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/datasets/pme_demo/0000013.png
infer_mode: False
use_space_char: False
rec_char_dict_path: &rec_char_dict_path ppocr/utils/dict/unimernet_tokenizer
max_new_tokens: &max_new_tokens 1024
input_size: &input_size [384, 384]
save_res_path: ./output/rec/predicts_unimernet_latexocr.txt
allow_resize_largeImg: False
start_ema: True
Optimizer:
name: AdamW
beta1: 0.9
beta2: 0.999
weight_decay: 0.05
lr:
name: LinearWarmupCosine
learning_rate: 0.0001
Architecture:
model_type: rec
algorithm: PP-FormulaNet-S
in_channels: 3
Transform:
Backbone:
name: PPHGNetV2_B4
class_num: 1024
Head:
name: PPFormulaNet_Head
max_new_tokens: *max_new_tokens
decoder_start_token_id: 0
decoder_ffn_dim: 1536
decoder_hidden_size: 384
decoder_layers: 2
temperature: 0.2
do_sample: False
top_p: 0.95
encoder_hidden_size: 2048
is_export: False
length_aware: True
use_parallel: True,
parallel_step: 3
Loss:
name: PPFormulaNet_S_Loss
parallel_step: 3
PostProcess:
name: UniMERNetDecode
rec_char_dict_path: *rec_char_dict_path
Metric:
name: LaTeXOCRMetric
main_indicator: exp_rate
cal_blue_score: False
Train:
dataset:
name: SimpleDataSet
data_dir: ./ocr_rec_latexocr_dataset_example
label_file_list: ["./ocr_rec_latexocr_dataset_example/train.txt"]
transforms:
- UniMERNetImgDecode:
input_size: *input_size
- UniMERNetTrainTransform:
- LatexImageFormat:
- UniMERNetLabelEncode:
rec_char_dict_path: *rec_char_dict_path
max_seq_len: *max_new_tokens
- KeepKeys:
keep_keys: ['image', 'label', 'attention_mask']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 14
num_workers: 0
collate_fn: UniMERNetCollator
Eval:
dataset:
name: SimpleDataSet
data_dir: ./ocr_rec_latexocr_dataset_example
label_file_list: ["./ocr_rec_latexocr_dataset_example/val.txt"]
transforms:
- UniMERNetImgDecode:
input_size: *input_size
- UniMERNetTestTransform:
- LatexImageFormat:
- UniMERNetLabelEncode:
max_seq_len: *max_new_tokens
rec_char_dict_path: *rec_char_dict_path
- KeepKeys:
keep_keys: ['image', 'label', 'attention_mask', 'filename']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 30
num_workers: 0
collate_fn: UniMERNetCollator

View File

@ -15,7 +15,9 @@ Global:
infer_img: doc/datasets/pme_demo/0000013.png
infer_mode: False
use_space_char: False
rec_char_dict_path: ppocr/utils/dict/unimernet_tokenizer
rec_char_dict_path: &rec_char_dict_path ppocr/utils/dict/unimernet_tokenizer
input_size: &input_size [192, 672]
max_seq_len: &max_seq_len 1024
save_res_path: ./output/rec/predicts_unimernet_plus_config_latexocr.txt
allow_resize_largeImg: False
@ -59,7 +61,7 @@ Loss:
PostProcess:
name: UniMERNetDecode
rec_char_dict_path: ppocr/utils/dict/unimernet_tokenizer
rec_char_dict_path: *rec_char_dict_path
Metric:
name: LaTeXOCRMetric
@ -73,12 +75,12 @@ Train:
label_file_list: ["./train_data/UniMERNet/train_unimernet_1M.txt"]
transforms:
- UniMERNetImgDecode:
input_size: [192, 672]
input_size: *input_size
- UniMERNetTrainTransform:
- UniMERNetImageFormat:
- UniMERNetLabelEncode:
rec_char_dict_path: ppocr/utils/dict/unimernet_tokenizer
max_seq_len: 1024
rec_char_dict_path: *rec_char_dict_path
max_seq_len: *max_seq_len
- KeepKeys:
keep_keys: ['image', 'label', 'attention_mask']
loader:
@ -95,12 +97,12 @@ Eval:
label_file_list: ["./train_data/UniMERNet/test_unimernet_cpe.txt"]
transforms:
- UniMERNetImgDecode:
input_size: [192, 672]
input_size: *input_size
- UniMERNetTestTransform:
- UniMERNetImageFormat:
- UniMERNetLabelEncode:
max_seq_len: 1024
rec_char_dict_path: ppocr/utils/dict/unimernet_tokenizer
max_seq_len: *max_seq_len
rec_char_dict_path: *rec_char_dict_path
- KeepKeys:
keep_keys: ['image', 'label', 'attention_mask']
loader:

View File

@ -0,0 +1,94 @@
# 印刷数学公式识别算法-PP-FormulaNet
## 1. 算法简介
`PP-FormulaNet` 是百度飞桨自研的公式识别模型,采用 PaddleX 内部自建的 5百万数据集进行训练在对应测试集上的精度如下
| 模型 | 骨干网络 | 配置文件 | SPE-<br/>BLEU↑ | CPE-<br/>BLEU↑ | Easy-<br/>BLEU↑ | Middle-<br/>BLEU↑ | Hard-<br/>BLEU↑| Avg-<br/>BLEU↑ | 下载链接 |
|-----------|------------|------------------|:--------------:|:---------:|:----------:|:----------------:|:---------:|:-----------------:|:-----------------:|
| UniMERNet | Donut Swin | [rec_unimernet.yml](../../../configs/rec/rec_unimernet.yml) | 0.9187 | 0.9252 | 0.8658 | 0.8228 | 0.7740 | 0.8613 |[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_unimernet_train.tar)|
| PP-FormulaNet-S | PPHGNetV2_B4 | [rec_pp_formulanet_s.yml](../../../configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml) | 0.8694 | 0.8071 | 0.9294 | 0.9112 | 0.8391 | 0.8712 |[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_s_train.tar)|
| PP-FormulaNet-L | Vary_VIT_B | [rec_pp_formulanet_l.yml](../../../configs/rec/PP-FormuaNet/rec_pp_formulanet_l.yml) | 0.9055 | 0.9206 | 0.9392 | 0.9273 | 0.9141 | 0.9213 |[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_l_train.tar )|
其中SPE、CPE为UniMERNet的简单公式数据集和复杂公式数据集Easy、Middle、Hard为PaddleX内部自建的简单公式数据集LaTeX 代码长度 0-64、中等公式数据集LaTeX 代码长度 64-256和复杂公式数据集LaTeX 代码长度 256+)。
## 2. 环境配置
请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。
此外,需要安装额外的依赖:
```shell
sudo apt-get update
sudo apt-get install libmagickwand-dev
pip install -r docs/algorithm/formula_recognition/requirements.txt
```
## 3. 模型训练、评估、预测
### 3.1 准备数据集
```shell
# 下载 PaddleX 官方示例数据集
wget https://paddle-model-ecology.bj.bcebos.com/paddlex/data/ocr_rec_latexocr_dataset_example.tar
tar -xf ocr_rec_latexocr_dataset_example.tar
```
### 3.2 下载预训练模型
```shell
# 下载 PP-FormulaNet-S 预训练模型
wget https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_s_train.tar
tar -xf rec_ppformulanet_s_train.tar
```
### 3.3 模型训练
请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化训练 `PP-FormulaNet-S` 识别模型时需要**更换配置文件**为 `PP-FormulaNet-S` 的[配置文件](https://github.com/PaddlePaddle/PaddleOCR/blob/main/configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml)。
#### 启动训练
具体地,在完成数据准备后,便可以启动训练,训练命令如下:
```shell
#单卡训练 (默认训练方式)
python3 tools/train.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
-o Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
#多卡训练,通过--gpus参数指定卡号
python3 -m paddle.distributed.launch --gpus '0,1,2,3' --ips=127.0.0.1 tools/train.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
-o Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
```
**注意:**
- 默认每训练 1个epoch179 次iteration进行1次评估若您更改训练的batch_size或更换数据集请在训练时作出如下修改
```
python3 -m paddle.distributed.launch --gpus '0,1,2,3' --ips=127.0.0.1 tools/train.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
-o Global.eval_batch_step=[0,{length_of_dataset//batch_size//4}] \
Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
```
### 3.4 评估
可下载已训练完成的[模型文件](https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_s_train.tar ),使用如下命令进行评估:
```shell
# 注意将pretrained_model的路径设置为本地路径。若使用自行训练保存的模型请注意修改路径和文件名为{path/to/weights}/{model_name}。
# demo 测试集评估
python3 tools/eval.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml -o \
Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
```
### 3.5 预测
使用如下命令进行单张图片预测:
```shell
# 注意将pretrained_model的路径设置为本地路径。
python3 tools/infer_rec.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
-o Global.infer_img='./docs/datasets/images/pme_demo/0000099.png'\
Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
# 预测文件夹下所有图像时可修改infer_img为文件夹如 Global.infer_img='./doc/datasets/pme_demo/'。
```
## 4. FAQ

View File

@ -0,0 +1,78 @@
# PP-FormulaNet
## 1. Introduction
PP-FormulaNet is a formula recognition model independently developed by Baidu PaddlePaddle. It is trained on a self-built dataset of 5 million samples within PaddleX, achieving the following accuracy on the corresponding test set:
| Model | Backbone | config |SPE-<br/>BLEU↑ | CPE-<br/>BLEU↑ | Easy-<br/>BLEU↑ | Middle-<br/>BLEU↑ | Hard-<br/>BLEU↑| Avg-<br/>BLEU↑ | Download link |
|-----------|--------|---------------------------------------------------|:--------------:|:-----------------:|:----------:|:----------------:|:---------:|:-----------------:|:--------------:|
| UniMERNet | Donut Swin | [rec_unimernet.yml](../../../configs/rec/rec_unimernet.yml) | 0.9187 | 0.9252 | 0.8658 | 0.8228 | 0.7740 | 0.8613 |[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_unimernet_train.tar)|
| PP-FormulaNet-S | PPHGNetV2_B4 | [rec_pp_formulanet_s.yml](../../../configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml) | 0.8694 | 0.8071 | 0.9294 | 0.9112 | 0.8391 | 0.8712 |[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_s_train.tar)|
| PP-FormulaNet-L | Vary_VIT_B | [rec_pp_formulanet_l.yml](../../../configs/rec/PP-FormuaNet/rec_pp_formulanet_l.yml) | 0.9055 | 0.9206 | 0.9392 | 0.9273 | 0.9141 | 0.9213 |[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_l_train.tar )|
Among them, SPE and CPE refer to the simple and complex formula datasets of UniMERNet, respectively. Easy, Middle, and Hard are simple (LaTeX code length 0-64), medium (LaTeX code length 64-256), and complex formula datasets (LaTeX code length 256+) built internally by PaddleX.
## 2. Environment
Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md) to clone the project code.
Furthermore, additional dependencies need to be installed:
```shell
sudo apt-get update
sudo apt-get install libmagickwand-dev
pip install -r docs/algorithm/formula_recognition/requirements.txt
```
## 3. Model Training / Evaluation / Prediction
Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
Dataset Preparation:
```shell
# download PaddleX official example dataset
wget https://paddle-model-ecology.bj.bcebos.com/paddlex/data/ocr_rec_latexocr_dataset_example.tar
tar -xf ocr_rec_latexocr_dataset_example.tar
```
Download the Pre-trained Model:
```shell
# download the PP-FormulaNet-S pretrained model
wget https://paddleocr.bj.bcebos.com/contribution/rec_ppformulanet_s_train.tar
tar -xf rec_ppformulanet_s_train.tar
```
Training:
Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
```shell
#Single GPU training
python3 tools/train.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
-o Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
#Multi GPU training, specify the gpu number through the --gpus parameter
python3 -m paddle.distributed.launch --gpus '0,1,2,3' --ips=127.0.0.1 tools/train.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
-o Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
```
Evaluation:
```shell
# GPU evaluation
python3 tools/eval.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml -o \
Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
```
Prediction:
```shell
# The configuration file used for prediction must match the training
python3 tools/infer_rec.py -c configs/rec/PP-FormuaNet/rec_pp_formulanet_s.yml \
-o Global.infer_img='./docs/datasets/images/pme_demo/0000099.png'\
Global.pretrained_model=./rec_ppformulanet_s_train/best_accuracy.pdparams
```
## 4. FAQ

View File

@ -20,7 +20,6 @@
此外,需要安装额外的依赖:
```shell
apt-get install sudo
sudo apt-get update
sudo apt-get install libmagickwand-dev
pip install -r docs/algorithm/formula_recognition/requirements.txt
@ -107,7 +106,7 @@ python3 -m paddle.distributed.launch --gpus '0,1,2,3' --ips=127.0.0.1 tools/tr
```
python3 -m paddle.distributed.launch --gpus '0,1,2,3' --ips=127.0.0.1 tools/train.py -c configs/rec/rec_unimernet.yml \
-o Global.eval_batch_step=[0,{length_of_dataset//batch_size//4}] \
-o Global.pretrained_model=./pretrain_models/texify.pdparams
Global.pretrained_model=./pretrain_models/texify.pdparams
```
### 3.4 评估

View File

@ -21,7 +21,6 @@ Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to co
Furthermore, additional dependencies need to be installed:
```shell
apt-get install sudo
sudo apt-get update
sudo apt-get install libmagickwand-dev
pip install -r docs/algorithm/formula_recognition/requirements.txt

View File

@ -372,6 +372,8 @@ nav:
- 公式识别算法:
- CAN: algorithm/formula_recognition/algorithm_rec_can.md
- LaTeX-OCR: algorithm/formula_recognition/algorithm_rec_latex_ocr.md
- UniMERNet: algorithm/formula_recognition/algorithm_rec_unimernet.md
- PP-FormulaNet: algorithm/formula_recognition/algorithm_rec_ppformulanet.md
- 端到端OCR算法:
- PGNet: algorithm/end_to_end/algorithm_e2e_pgnet.md
- 表格识别算法:

View File

@ -47,6 +47,7 @@ from .rec_parseq_loss import ParseQLoss
from .rec_cppd_loss import CPPDLoss
from .rec_latexocr_loss import LaTeXOCRLoss
from .rec_unimernet_loss import UniMERNetLoss
from .rec_ppformulanet_loss import PPFormulaNet_S_Loss, PPFormulaNet_L_Loss
# cls loss
from .cls_loss import ClsLoss
@ -111,6 +112,8 @@ def build_loss(config):
"CPPDLoss",
"LaTeXOCRLoss",
"UniMERNetLoss",
"PPFormulaNet_S_Loss",
"PPFormulaNet_L_Loss",
]
config = copy.deepcopy(config)
module_name = config.pop("name")

View File

@ -0,0 +1,74 @@
# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
class PPFormulaNet_S_Loss(nn.Layer):
"""
PP=FormulaNet-S adopt CrossEntropyLoss for network training.
"""
def __init__(self, vocab_size=50000, parallel_step=1):
super(PPFormulaNet_S_Loss, self).__init__()
self.ignore_index = -100
self.vocab_size = vocab_size
self.parallel_step = int(parallel_step)
self.pad_token_id = 1
# ignore padding characters during training
self.cross = nn.CrossEntropyLoss(
reduction="mean", ignore_index=self.ignore_index
)
def forward(self, preds, batch):
logits, masked_label = preds
word_loss = self.cross(
paddle.reshape(logits, [-1, logits.shape[-1]]),
paddle.reshape(masked_label[:, self.parallel_step :], [-1]),
)
loss = word_loss
return {
"loss": loss,
"word_loss": word_loss,
}
class PPFormulaNet_L_Loss(nn.Layer):
"""
PPFormulaNet_L adopt CrossEntropyLoss for network training.
"""
def __init__(self, vocab_size=50000):
super(PPFormulaNet_L_Loss, self).__init__()
self.ignore_index = -100
self.vocab_size = vocab_size
self.pad_token_id = 1
# ignore padding characters during training
self.cross = nn.CrossEntropyLoss(
reduction="mean", ignore_index=self.ignore_index
)
def forward(self, preds, batch):
logits, masked_label = preds
word_loss = self.cross(
paddle.reshape(logits, [-1, logits.shape[-1]]),
paddle.reshape(masked_label[:, 1:], [-1]),
)
loss = word_loss
return {
"loss": loss,
"word_loss": word_loss,
}

View File

@ -70,7 +70,8 @@ def build_backbone(config, model_type):
from .rec_vit_parseq import ViTParseQ
from .rec_repvit import RepSVTR
from .rec_svtrv2 import SVTRv2
from .rec_vary_vit import Vary_VIT_B
from .rec_vary_vit import Vary_VIT_B, Vary_VIT_B_Formula
from .rec_pphgnetv2 import PPHGNetV2_B4
support_dict = [
"MobileNetV1Enhance",
@ -99,6 +100,8 @@ def build_backbone(config, model_type):
"HybridTransformer",
"DonutSwinModel",
"Vary_VIT_B",
"PPHGNetV2_B4",
"Vary_VIT_B_Formula",
]
elif model_type == "e2e":
from .e2e_resnet_vd_pg import ResNet

File diff suppressed because it is too large Load Diff

View File

@ -27,6 +27,7 @@ from paddle.nn.initializer import (
TruncatedNormal,
XavierUniform,
)
from ppocr.modeling.backbones.rec_donut_swin import DonutSwinModelOutput
zeros_ = Constant(value=0.0)
ones_ = Constant(value=1.0)
@ -90,6 +91,7 @@ class ImageEncoderViT(nn.Layer):
rel_pos_zero_init: bool = True,
window_size: int = 0,
global_attn_indexes: Tuple[int, ...] = (),
is_formula: bool = False,
) -> None:
"""
Args:
@ -168,6 +170,7 @@ class ImageEncoderViT(nn.Layer):
self.net_3 = nn.Conv2D(
512, 1024, kernel_size=3, stride=2, padding=1, bias_attr=False
)
self.is_formula = is_formula
def forward(self, x):
x = self.patch_embed(x)
@ -177,6 +180,8 @@ class ImageEncoderViT(nn.Layer):
x = blk(x)
x = self.neck(x.transpose([0, 3, 1, 2]))
x = self.net_2(x)
if self.is_formula:
x = self.net_3(x)
return x
@ -492,6 +497,7 @@ def _build_vary(
encoder_num_heads,
encoder_global_attn_indexes,
image_size,
is_formula=False,
):
prompt_embed_dim = 256
vit_patch_size = 16
@ -509,6 +515,7 @@ def _build_vary(
global_attn_indexes=encoder_global_attn_indexes,
window_size=14,
out_chans=prompt_embed_dim,
is_formula=is_formula,
)
return image_encoder
@ -543,3 +550,67 @@ class Vary_VIT_B(nn.Layer):
cnn_feature = self.vision_tower_high(pixel_values)
cnn_feature = cnn_feature.flatten(2).transpose([0, 2, 1])
return cnn_feature
class Vary_VIT_B_Formula(nn.Layer):
def __init__(
self,
in_channels=3,
image_size=768,
encoder_embed_dim=768,
encoder_depth=12,
encoder_num_heads=12,
encoder_global_attn_indexes=[2, 5, 8, 11],
):
"""
Vary_VIT_B_Formula
Args:
in_channels (int): Number of input channels. Default is 3 (for RGB images).
image_size (int): Size of the input image. Default is 768.
encoder_embed_dim (int): Dimension of the encoder's embedding. Default is 768.
encoder_depth (int): Number of layers (depth) in the encoder. Default is 12.
encoder_num_heads (int): Number of attention heads in the encoder. Default is 12.
encoder_global_attn_indexes (list): List of indices specifying which encoder layers use global attention. Default is [2, 5, 8, 11].
Returns:
model: nn.Layer. Specific `Vary_VIT_B_Formula` model with defined architecture.
"""
super(Vary_VIT_B_Formula, self).__init__()
self.vision_tower_high = _build_vary(
encoder_embed_dim=encoder_embed_dim,
encoder_depth=encoder_depth,
encoder_num_heads=encoder_num_heads,
encoder_global_attn_indexes=[2, 5, 8, 11],
image_size=image_size,
is_formula=True,
)
self.mm_projector_vary = nn.Linear(1024, 1024)
self.out_channels = 1024
def forward(self, input_data):
if self.training:
pixel_values, label, attention_mask = input_data
else:
if isinstance(input_data, list):
pixel_values = input_data[0]
else:
pixel_values = input_data
num_channels = pixel_values.shape[1]
if num_channels == 1:
pixel_values = paddle.repeat_interleave(pixel_values, repeats=3, axis=1)
cnn_feature = self.vision_tower_high(pixel_values)
cnn_feature = cnn_feature.flatten(2).transpose([0, 2, 1])
cnn_feature = self.mm_projector_vary(cnn_feature)
donut_swin_output = DonutSwinModelOutput(
last_hidden_state=cnn_feature,
pooler_output=None,
hidden_states=None,
attentions=None,
reshaped_hidden_states=None,
)
if self.training:
return donut_swin_output, label, attention_mask
else:
return donut_swin_output

View File

@ -45,6 +45,7 @@ def build_head(config):
from .rec_parseq_head import ParseQHead
from .rec_cppd_head import CPPDHead
from .rec_unimernet_head import UniMERNetHead
from .rec_ppformulanet_head import PPFormulaNet_Head
# cls head
from .cls_head import ClsHead
@ -89,6 +90,7 @@ def build_head(config):
"ParseQHead",
"CPPDHead",
"UniMERNetHead",
"PPFormulaNet_Head",
]
if config["name"] == "DRRGHead":

File diff suppressed because it is too large Load Diff

View File

@ -217,6 +217,8 @@ class MBartConfig(object):
forced_eos_token_id=2,
_attn_implementation="eager",
hidden_size=1024,
use_parallel=False,
parallel_step=2,
is_export=False,
**kwargs,
):
@ -251,6 +253,8 @@ class MBartConfig(object):
self.is_encoder_decoder = is_encoder_decoder
self.forced_eos_token_id = forced_eos_token_id
self._attn_implementation = _attn_implementation
self.use_parallel = use_parallel
self.parallel_step = parallel_step
self.is_export = is_export
super().__init__()
@ -310,6 +314,22 @@ class AttentionMaskConverter:
[bsz, 1, tgt_len, tgt_len + past_key_values_length]
)
def to_4d_export(
self,
attention_mask_2d,
query_length,
dtype,
key_value_length,
is_export=False,
):
input_shape = (attention_mask_2d.shape[0], query_length)
expanded_attn_mask = self._expand_mask(
attention_mask_2d, dtype, tgt_len=input_shape[-1]
)
expanded_4d_mask = expanded_attn_mask
return expanded_4d_mask
def to_4d(
self,
attention_mask_2d,
@ -321,7 +341,6 @@ class AttentionMaskConverter:
input_shape = (attention_mask_2d.shape[0], query_length)
causal_4d_mask = None
if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
if key_value_length is None:
raise ValueError(
@ -375,6 +394,33 @@ def _prepare_4d_attention_mask(mask, dtype, tgt_len=None):
return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
def _prepare_4d_causal_attention_mask_export(
attention_mask,
input_shape,
inputs_embeds,
past_key_values_length,
sliding_window=None,
is_export=False,
):
attn_mask_converter = AttentionMaskConverter(
is_causal=True, sliding_window=sliding_window
)
key_value_length = input_shape[-1] + past_key_values_length
shape = attention_mask.shape
len_shape = len(shape)
attention_mask = attn_mask_converter.to_4d_export(
attention_mask,
input_shape[-1],
key_value_length=key_value_length,
dtype=inputs_embeds.dtype,
is_export=is_export,
)
return attention_mask
def _prepare_4d_causal_attention_mask(
attention_mask,
input_shape,
@ -1681,7 +1727,7 @@ class CustomMBartDecoder(MBartDecoder):
)
else:
if self.is_export:
attention_mask = _prepare_4d_causal_attention_mask(
attention_mask = _prepare_4d_causal_attention_mask_export(
attention_mask,
input_shape,
inputs_embeds,
@ -1721,6 +1767,7 @@ class CustomMBartDecoder(MBartDecoder):
hidden_states = nn.functional.dropout(
hidden_states, p=self.dropout, training=self.training
)
if self.gradient_checkpointing and self.training:
if use_cache:
print(
@ -1828,7 +1875,6 @@ class CustomMBartDecoder(MBartDecoder):
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_cache,
@ -2237,6 +2283,21 @@ class UniMERNetHead(nn.Layer):
}
return input_dict
def prepare_inputs_for_generation_export(
self,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
input_dict = {
"decoder_attention_mask": None,
"use_cache": use_cache,
}
return input_dict
def _extract_past_from_model_output(
self, outputs: ModelOutput, standardize_cache_format: bool = False
):
@ -2434,9 +2495,10 @@ class UniMERNetHead(nn.Layer):
@paddle.no_grad()
def generate_export(
self,
encoder_outputs,
model_kwargs,
):
batch_size = model_kwargs["encoder_outputs"]["last_hidden_state"].shape[0]
batch_size = encoder_outputs["last_hidden_state"].shape[0]
generation_config = {
"decoder_start_token_id": 0,
"bos_token_id": 0,
@ -2447,26 +2509,33 @@ class UniMERNetHead(nn.Layer):
decoder_start_token_id=generation_config["decoder_start_token_id"],
bos_token_id=generation_config["bos_token_id"],
)
input_ids = input_ids.reshape([-1, 1])
decoder_input_ids = input_ids
model_kwargs["key use_cache"] = True
batch_size, cur_len = input_ids.shape
if "inputs_embeds" in model_kwargs:
cur_len = model_kwargs["inputs_embeds"].shape[1]
model_kwargs["cache_position"] = paddle.arange(cur_len)
cache_position = paddle.arange(cur_len)
pad_token_id = self.pad_token_id
eos_token_id = [self.eos_token_id]
eos_token = self.eos_token_id
unfinished_sequences = paddle.ones([batch_size], dtype=paddle.int64)
i_idx = paddle.full([], 0)
past_key_values = []
for i in range(8):
init_arr = paddle.zeros([batch_size, 16, 0, 64])
paddle.jit.api.set_dynamic_shape(init_arr, [-1, -1, -1, -1])
cache = (init_arr, init_arr, init_arr, init_arr)
past_key_values.append(cache)
idx = 0
while i_idx < paddle.to_tensor(self.max_seq_len):
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
decoder_input_ids = model_inputs["decoder_input_ids"]
model_inputs = self.prepare_inputs_for_generation_export(
past_key_values=past_key_values, **model_kwargs
)
decoder_attention_mask = model_inputs["decoder_attention_mask"]
encoder_outputs = model_inputs["encoder_outputs"]
past_key_values = model_inputs["past_key_values"]
decoder_attention_mask = paddle.ones(input_ids.shape)
paddle.jit.api.set_dynamic_shape(decoder_input_ids, [-1, -1])
paddle.jit.api.set_dynamic_shape(decoder_attention_mask, [-1, -1])
@ -2489,6 +2558,10 @@ class UniMERNetHead(nn.Layer):
1 - unfinished_sequences
)
input_ids = paddle.concat([input_ids, next_tokens.unsqueeze(1)], axis=-1)
past_length = past_key_values[0][0].shape[2]
decoder_input_ids = next_tokens.unsqueeze(1)
past_key_values = outputs.past_key_values
cache_position = cache_position[-1:] + 1
unfinished_sequences = unfinished_sequences & ~self.stopping_criteria(
input_ids
).cast(paddle.int64)
@ -2500,6 +2573,7 @@ class UniMERNetHead(nn.Layer):
).all()
):
break
i_idx += 1
return input_ids
@ -2578,15 +2652,20 @@ class UniMERNetHead(nn.Layer):
"""
if not self.training:
encoder_outputs = inputs
model_kwargs = {
"output_attentions": False,
"output_hidden_states": False,
"use_cache": True,
"encoder_outputs": encoder_outputs,
}
if self.is_export:
word_pred = self.generate_export(model_kwargs)
model_kwargs = {
"output_attentions": False,
"output_hidden_states": False,
"use_cache": True,
}
word_pred = self.generate_export(encoder_outputs, model_kwargs)
else:
model_kwargs = {
"output_attentions": False,
"output_hidden_states": False,
"use_cache": True,
"encoder_outputs": encoder_outputs,
}
word_pred = self.generate(model_kwargs)
return word_pred

View File

@ -1,4 +1,4 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -63,6 +63,12 @@ def dump_infer_config(config, path, logger):
common_dynamic_shapes = {
"x": [[1, 3, 224, 224], [1, 3, 448, 448], [8, 3, 1280, 1280]]
}
elif arch_config["algorithm"] == "UniMERNet":
common_dynamic_shapes = {"x": [[1, 3, 192, 672]]}
elif arch_config["algorithm"] == "PP-FormulaNet-L":
common_dynamic_shapes = {"x": [[1, 3, 768, 768]]}
elif arch_config["algorithm"] == "PP-FormulaNet-S":
common_dynamic_shapes = {"x": [[1, 3, 384, 384]]}
else:
common_dynamic_shapes = None
@ -91,6 +97,25 @@ def dump_infer_config(config, path, logger):
with open(tokenizer_file, encoding="utf-8") as tokenizer_config_handle:
character_dict = json.load(tokenizer_config_handle)
postprocess["character_dict"] = character_dict
elif config["Architecture"].get("algorithm") in [
"UniMERNet",
"PP-FormulaNet-L",
"PP-FormulaNet-S",
]:
tokenizer_file = config["Global"].get("rec_char_dict_path")
fast_tokenizer_file = os.path.join(tokenizer_file, "tokenizer.json")
tokenizer_config_file = os.path.join(tokenizer_file, "tokenizer_config.json")
postprocess["character_dict"] = {}
if fast_tokenizer_file is not None:
with open(fast_tokenizer_file, encoding="utf-8") as tokenizer_config_handle:
character_dict = json.load(tokenizer_config_handle)
postprocess["character_dict"]["fast_tokenizer_file"] = character_dict
if tokenizer_config_file is not None:
with open(
tokenizer_config_file, encoding="utf-8"
) as tokenizer_config_handle:
character_dict = json.load(tokenizer_config_handle)
postprocess["character_dict"]["tokenizer_config_file"] = character_dict
else:
if config["Global"].get("character_dict_path") is not None:
with open(config["Global"]["character_dict_path"], encoding="utf-8") as f:
@ -208,6 +233,31 @@ def dynamic_to_static(model, arch_config, logger, input_shape=None):
paddle.static.InputSpec(shape=[None, 1, None, None], dtype="float32"),
]
model = to_static(model, input_spec=other_shape)
elif arch_config["algorithm"] == "UniMERNet":
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(shape=[-1, 1, 192, 672], dtype="float32")
],
full_graph=True,
)
elif arch_config["algorithm"] == "PP-FormulaNet-L":
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(shape=[-1, 1, 768, 768], dtype="float32")
],
full_graph=True,
)
elif arch_config["algorithm"] == "PP-FormulaNet-S":
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(shape=[-1, 1, 384, 384], dtype="float32")
],
full_graph=True,
)
elif arch_config["algorithm"] in ["LayoutLM", "LayoutLMv2", "LayoutXLM"]:
input_spec = [
paddle.static.InputSpec(shape=[None, 512], dtype="int64"), # input_ids
@ -368,6 +418,14 @@ def export(config, base_model=None, save_path=None):
config["Architecture"]["Backbone"]["is_predict"] = True
config["Architecture"]["Backbone"]["is_export"] = True
config["Architecture"]["Head"]["is_export"] = True
if config["Architecture"].get("algorithm") in ["UniMERNet"]:
config["Architecture"]["Backbone"]["is_export"] = True
config["Architecture"]["Head"]["is_export"] = True
if config["Architecture"].get("algorithm") in [
"PP-FormulaNet-S",
"PP-FormulaNet-L",
]:
config["Architecture"]["Head"]["is_export"] = True
if base_model is not None:
model = base_model
if isinstance(model, paddle.DataParallel):

View File

@ -9,7 +9,10 @@ import pytest
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(current_dir, "..")))
from ppocr.modeling.backbones.rec_donut_swin import DonutSwinModel, DonutSwinModelOutput
from ppocr.modeling.backbones.rec_pphgnetv2 import PPHGNetV2_B4
from ppocr.modeling.backbones.rec_vary_vit import Vary_VIT_B_Formula
from ppocr.modeling.heads.rec_unimernet_head import UniMERNetHead
from ppocr.modeling.heads.rec_ppformulanet_head import PPFormulaNet_Head
@pytest.fixture
@ -17,6 +20,16 @@ def sample_image():
return paddle.randn([1, 1, 192, 672])
@pytest.fixture
def sample_image_ppformulanet_s():
return paddle.randn([1, 1, 384, 384])
@pytest.fixture
def sample_image_ppformulanet_l():
return paddle.randn([1, 1, 768, 768])
@pytest.fixture
def encoder_feat():
encoded_feat = paddle.randn([1, 126, 1024])
@ -25,6 +38,22 @@ def encoder_feat():
)
@pytest.fixture
def encoder_feat_ppformulanet_s():
encoded_feat = paddle.randn([1, 144, 2048])
return DonutSwinModelOutput(
last_hidden_state=encoded_feat,
)
@pytest.fixture
def encoder_feat_ppformulanet_l():
encoded_feat = paddle.randn([1, 144, 1024])
return DonutSwinModelOutput(
last_hidden_state=encoded_feat,
)
def test_unimernet_backbone(sample_image):
"""
Test UniMERNet backbone.
@ -68,3 +97,99 @@ def test_unimernet_head(encoder_feat):
with paddle.no_grad():
result = head(encoder_feat)
assert result.shape == [1, 6]
def test_ppformulanet_s_backbone(sample_image_ppformulanet_s):
"""
Test PP-FormulaNet-S backbone.
Args:
sample_image_ppformulanet_s: sample image to be processed.
"""
backbone = PPHGNetV2_B4(
class_num=1024,
)
backbone.eval()
with paddle.no_grad():
result = backbone(sample_image_ppformulanet_s)
encoder_feat = result[0]
assert encoder_feat.shape == [1, 144, 2048]
def test_ppformulanet_s_head(encoder_feat_ppformulanet_s):
"""
Test PP-FormulaNet-S head.
Args:
encoder_feat_ppformulanet_s: encoder feature from PP-FormulaNet-S backbone.
"""
head = PPFormulaNet_Head(
max_new_tokens=6,
decoder_start_token_id=0,
decoder_ffn_dim=1536,
decoder_hidden_size=384,
decoder_layers=2,
temperature=0.2,
do_sample=False,
top_p=0.95,
encoder_hidden_size=2048,
is_export=False,
length_aware=True,
use_parallel=True,
parallel_step=3,
)
head.eval()
with paddle.no_grad():
result = head(encoder_feat_ppformulanet_s)
assert result.shape == [1, 9]
def test_ppformulanet_l_backbone(sample_image_ppformulanet_l):
"""
Test PP-FormulaNet-L backbone.
Args:
sample_image_ppformulanet_l: sample image to be processed.
"""
backbone = Vary_VIT_B_Formula(
image_size=768,
encoder_embed_dim=768,
encoder_depth=12,
encoder_num_heads=12,
encoder_global_attn_indexes=[2, 5, 8, 11],
)
backbone.eval()
with paddle.no_grad():
result = backbone(sample_image_ppformulanet_l)
encoder_feat = result[0]
assert encoder_feat.shape == [1, 144, 1024]
def test_ppformulanet_l_head(encoder_feat_ppformulanet_l):
"""
Test PP-FormulaNet-L head.
Args:
encoder_feat_ppformulanet_l: encoder feature from PP-FormulaNet-L Head.
"""
head = PPFormulaNet_Head(
max_new_tokens=6,
decoder_start_token_id=0,
decoder_ffn_dim=2048,
decoder_hidden_size=512,
decoder_layers=8,
temperature=0.2,
do_sample=False,
top_p=0.95,
encoder_hidden_size=1024,
is_export=False,
length_aware=False,
use_parallel=False,
parallel_step=0,
)
head.eval()
with paddle.no_grad():
result = head(encoder_feat_ppformulanet_l)
assert result.shape == [1, 7]

View File

@ -111,6 +111,12 @@ def main():
elif config["Architecture"]["algorithm"] == "UniMERNet":
model_type = "unimernet"
config["Metric"]["cal_blue_score"] = True
elif config["Architecture"]["algorithm"] in [
"PP-FormulaNet-S",
"PP-FormulaNet-L",
]:
model_type = "pp_formulanet"
config["Metric"]["cal_blue_score"] = True
else:
model_type = config["Architecture"]["model_type"]
else:

View File

@ -134,7 +134,11 @@ def main():
logger.info("infer_img: {}".format(file))
with open(file, "rb") as f:
img = f.read()
if config["Architecture"]["algorithm"] in ["UniMERNet"]:
if config["Architecture"]["algorithm"] in [
"UniMERNet",
"PP-FormulaNet-S",
"PP-FormulaNet-L",
]:
data = {"image": img, "filename": file}
else:
data = {"image": img}
@ -192,7 +196,12 @@ def main():
elif isinstance(post_result, list) and isinstance(post_result[0], int):
# for RFLearning CNT branch
info = str(post_result[0])
elif config["Architecture"]["algorithm"] in ["LaTeXOCR", "UniMERNet"]:
elif config["Architecture"]["algorithm"] in [
"LaTeXOCR",
"UniMERNet",
"PP-FormulaNet-S",
"PP-FormulaNet-L",
]:
info = str(post_result[0])
else:
if len(post_result[0]) >= 2:

View File

@ -333,7 +333,12 @@ def train(
preds = model(batch)
elif algorithm in ["CAN"]:
preds = model(batch[:3])
elif algorithm in ["LaTeXOCR", "UniMERNet"]:
elif algorithm in [
"LaTeXOCR",
"UniMERNet",
"PP-FormulaNet-S",
"PP-FormulaNet-L",
]:
preds = model(batch)
else:
preds = model(images)
@ -350,7 +355,12 @@ def train(
preds = model(batch)
elif algorithm in ["CAN"]:
preds = model(batch[:3])
elif algorithm in ["LaTeXOCR", "UniMERNet"]:
elif algorithm in [
"LaTeXOCR",
"UniMERNet",
"PP-FormulaNet-S",
"PP-FormulaNet-L",
]:
preds = model(batch)
else:
preds = model(images)
@ -381,6 +391,10 @@ def train(
model_type = "unimernet"
post_result = post_process_class(preds[0], batch[1], mode="train")
eval_class(post_result[0], post_result[1], epoch_reset=(idx == 0))
elif algorithm in ["PP-FormulaNet-S", "PP-FormulaNet-L"]:
model_type = "pp_formulanet"
post_result = post_process_class(preds[0], batch[1], mode="train")
eval_class(post_result[0], post_result[1], epoch_reset=(idx == 0))
else:
if config["Loss"]["name"] in [
"MultiLoss",
@ -677,7 +691,7 @@ def eval(
preds = model(batch)
elif model_type in ["can"]:
preds = model(batch[:3])
elif model_type in ["latexocr", "unimernet"]:
elif model_type in ["latexocr", "unimernet", "pp_formulanet"]:
preds = model(batch)
elif model_type in ["sr"]:
preds = model(batch)
@ -705,7 +719,7 @@ def eval(
eval_class(preds, batch_numpy)
elif model_type in ["can"]:
eval_class(preds[0], batch_numpy[2:], epoch_reset=(idx == 0))
elif model_type in ["latexocr", "unimernet"]:
elif model_type in ["latexocr", "unimernet", "pp_formulanet"]:
post_result = post_process_class(preds, batch[1], "eval")
eval_class(post_result[0], post_result[1], epoch_reset=(idx == 0))
else:
@ -855,6 +869,8 @@ def preprocess(is_train=False):
"LaTeXOCR",
"UniMERNet",
"SLANeXt",
"PP-FormulaNet-S",
"PP-FormulaNet-L",
]
if use_xpu: