Merge branch 'PaddlePaddle:dygraph' into dygraph

pull/4843/head
Evezerest 2021-12-06 17:45:37 +08:00 committed by GitHub
commit 2945abd703
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
156 changed files with 7358 additions and 1384 deletions

View File

@ -5,28 +5,36 @@ set -xe
function _set_params(){
run_mode=${1:-"sp"} # 单卡sp|多卡mp
batch_size=${2:-"64"}
fp_item=${3:-"fp32"} # fp32|fp16
max_iter=${4:-"10"} # 可选,如果需要修改代码提前中断
model_name=${5:-"model_name"}
fp_item=${3:-"fp32"} # fp32|fp16
max_epoch=${4:-"10"} # 可选,如果需要修改代码提前中断
model_item=${5:-"model_item"}
run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数
# 日志解析所需参数
base_batch_size=${batch_size}
mission_name="OCR"
direction_id="0"
ips_unit="images/sec"
skip_steps=2 # 解析日志有些模型前几个step耗时长需要跳过 (必填)
keyword="ips:" # 解析日志,筛选出数据所在行的关键字 (必填)
index="1"
model_name=${model_item}_bs${batch_size}_${fp_item} # model_item 用于yml文件名匹配model_name 用于数据入库前端展示
# 以下不用修改
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
}
function _train(){
echo "Train on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
train_cmd="-c configs/det/${model_name}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_iter} Global.eval_batch_step=[0,20000] Global.print_batch_step=2"
train_cmd="-c configs/det/${model_item}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_epoch} Global.eval_batch_step=[0,20000] Global.print_batch_step=2"
case ${run_mode} in
sp)
train_cmd="python3.7 tools/train.py "${train_cmd}""
train_cmd="python tools/train.py "${train_cmd}""
;;
mp)
train_cmd="python3.7 -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac
@ -46,17 +54,7 @@ function _train(){
fi
}
function _analysis_log(){
analysis_cmd="python3.7 benchmark/analysis.py --filename ${log_file} --mission_name ${model_name} --run_mode ${run_mode} --direction_id 0 --keyword 'ips:' --base_batch_size ${batch_size} --skip_steps 1 --gpu_num ${num_gpu_devices} --index 1 --model_mode=-1 --ips_unit=samples/sec"
eval $analysis_cmd
}
function _kill_process(){
kill -9 `ps -ef|grep 'python3.7'|awk '{print $2}'`
}
source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开
_set_params $@
_train
_analysis_log
_kill_process
#_train # 如果只想产出训练log,不解析,可取消注释
_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开

View File

@ -1,25 +1,35 @@
#!/bin/bash
# 提供可稳定复现性能的脚本默认在标准docker环境内py37执行 paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37
# 执行目录: ./PaddleOCR
# 1 安装该模型需要的依赖 (如需开启优化策略请注明)
python3.7 -m pip install -r requirements.txt
log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
python -m pip install -r requirements.txt
# 2 拷贝该模型需要数据、预训练模型
wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data && tar xf icdar2015.tar && cd ../
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams
# 3 批量运行如不方便批量12需放到单个模型中
model_mode_list=(det_res18_db_v2.0 det_r50_vd_east det_r50_vd_pse)
fp_item_list=(fp32)
bs_list=(8 16)
for model_mode in ${model_mode_list[@]}; do
for fp_item in ${fp_item_list[@]}; do
if [ ${model_mode} == "det_r50_vd_east" ]; then
bs_list=(16)
else
bs_list=(8 16)
fi
for bs_item in ${bs_list[@]}; do
echo "index is speed, 1gpus, begin, ${model_name}"
run_mode=sp
CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode} # (5min)
log_name=ocr_${model_mode}_bs${bs_item}_${fp_item}_${run_mode}
CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 1 ${model_mode} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min)
sleep 60
echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
run_mode=mp
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode}
log_name=ocr_${model_mode}_bs${bs_item}_${fp_item}_${run_mode}
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
sleep 60
done
done

View File

@ -62,8 +62,7 @@ Loss:
weight: 0.05
num_classes: 6625
feat_dim: 96
init_center: false
center_file_path: "./train_center.pkl"
center_file_path:
# you can also try to add ace loss on your own dataset
# - ACELoss:
# weight: 0.1

View File

@ -34,10 +34,10 @@ PaddleOCR模型部署。
* 首先需要从opencv官网上下载在Linux环境下源码编译的包以opencv3.4.7为例,下载命令如下。
```
```bash
cd deploy/cpp_infer
wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz
tar -xf 3.4.7.tar.gz
wget https://paddleocr.bj.bcebos.com/libs/opencv/opencv-3.4.7.tar.gz
tar -xf opencv-3.4.7.tar.gz
```
最终可以在当前目录下看到`opencv-3.4.7/`的文件夹。
@ -45,12 +45,13 @@ tar -xf 3.4.7.tar.gz
* 编译opencv设置opencv源码路径(`root_path`)以及安装路径(`install_path`)。进入opencv源码路径下按照下面的方式进行编译。
```shell
root_path=your_opencv_root_path
root_path="your_opencv_root_path"
install_path=${root_path}/opencv3
build_dir=${root_path}/build
rm -rf build
mkdir build
cd build
rm -rf ${build_dir}
mkdir ${build_dir}
cd ${build_dir}
cmake .. \
-DCMAKE_INSTALL_PREFIX=${install_path} \
@ -74,6 +75,11 @@ make -j
make install
```
也可以直接修改`tools/build_opencv.sh`的内容,然后直接运行下面的命令进行编译。
```shell
sh tools/build_opencv.sh
```
其中`root_path`为下载的opencv源码路径`install_path`为opencv的安装路径`make install`完成之后会在该文件夹下生成opencv头文件和库文件用于后面的OCR代码编译。
@ -233,12 +239,12 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
--image_dir=../../doc/imgs/12.jpg
```
更多参数如下:
更多支持的可调节参数解释如下:
- 通用参数
|参数名称|类型|默认参数|意义|
| --- | --- | --- | --- |
| :---: | :---: | :---: | :---: |
|use_gpu|bool|false|是否使用GPU|
|gpu_id|int|0|GPU id使用GPU时有效|
|gpu_mem|int|4000|申请的GPU内存|
@ -248,7 +254,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
- 检测模型相关
|参数名称|类型|默认参数|意义|
| --- | --- | --- | --- |
| :---: | :---: | :---: | :---: |
|det_model_dir|string|-|检测模型inference model地址|
|max_side_len|int|960|输入图像长宽大于960时等比例缩放图像使得图像最长边为960|
|det_db_thresh|float|0.3|用于过滤DB预测的二值化图像设置为0.-0.3对结果影响不明显|
@ -260,7 +266,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
- 方向分类器相关
|参数名称|类型|默认参数|意义|
| --- | --- | --- | --- |
| :---: | :---: | :---: | :---: |
|use_angle_cls|bool|false|是否使用方向分类器|
|cls_model_dir|string|-|方向分类器inference model地址|
|cls_thresh|float|0.9|方向分类器的得分阈值|
@ -268,7 +274,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
- 识别模型相关
|参数名称|类型|默认参数|意义|
| --- | --- | --- | --- |
| :---: | :---: | :---: | :---: |
|rec_model_dir|string|-|识别模型inference model地址|
|char_list_file|string|../../ppocr/utils/ppocr_keys_v1.txt|字典文件|

View File

@ -17,10 +17,10 @@ PaddleOCR model deployment.
* First of all, you need to download the source code compiled package in the Linux environment from the opencv official website. Taking opencv3.4.7 as an example, the download command is as follows.
```
```bash
cd deploy/cpp_infer
wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz
tar -xf 3.4.7.tar.gz
wget https://paddleocr.bj.bcebos.com/libs/opencv/opencv-3.4.7.tar.gz
tar -xf opencv-3.4.7.tar.gz
```
Finally, you can see the folder of `opencv-3.4.7/` in the current directory.

View File

@ -0,0 +1,28 @@
root_path="/paddle/PaddleOCR/deploy/cpp_infer/opencv-3.4.7"
install_path=${root_path}/opencv3
build_dir=${root_path}/build
rm -rf ${build_dir}
mkdir ${build_dir}
cd ${build_dir}
cmake .. \
-DCMAKE_INSTALL_PREFIX=${install_path} \
-DCMAKE_BUILD_TYPE=Release \
-DBUILD_SHARED_LIBS=OFF \
-DWITH_IPP=OFF \
-DBUILD_IPP_IW=OFF \
-DWITH_LAPACK=OFF \
-DWITH_EIGEN=OFF \
-DCMAKE_INSTALL_LIBDIR=lib64 \
-DWITH_ZLIB=ON \
-DBUILD_ZLIB=ON \
-DWITH_JPEG=ON \
-DBUILD_JPEG=ON \
-DWITH_PNG=ON \
-DBUILD_PNG=ON \
-DWITH_TIFF=ON \
-DBUILD_TIFF=ON
make -j
make install

View File

@ -172,7 +172,10 @@ void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
cv::Mat resize_img;
int index = 0;
std::vector<double> time_info = {0, 0, 0};
for (int i = boxes.size() - 1; i >= 0; i--) {
auto preprocess_start = std::chrono::steady_clock::now();
crop_img = GetRotateCropImage(srcimg, boxes[i]);
if (use_direction_classify >= 1) {
crop_img = RunClsModel(crop_img, predictor_cls);
@ -191,7 +194,9 @@ void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
auto *data0 = input_tensor0->mutable_data<float>();
NeonMeanScale(dimg, data0, resize_img.rows * resize_img.cols, mean, scale);
auto preprocess_end = std::chrono::steady_clock::now();
//// Run CRNN predictor
auto inference_start = std::chrono::steady_clock::now();
predictor_crnn->Run();
// Get output and run postprocess
@ -199,8 +204,10 @@ void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
std::move(predictor_crnn->GetOutput(0)));
auto *predict_batch = output_tensor0->data<float>();
auto predict_shape = output_tensor0->shape();
auto inference_end = std::chrono::steady_clock::now();
// ctc decode
auto postprocess_start = std::chrono::steady_clock::now();
std::string str_res;
int argmax_idx;
int last_index = 0;
@ -224,7 +231,20 @@ void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
score /= count;
rec_text.push_back(str_res);
rec_text_score.push_back(score);
auto postprocess_end = std::chrono::steady_clock::now();
std::chrono::duration<float> preprocess_diff = preprocess_end - preprocess_start;
time_info[0] += double(preprocess_diff.count() * 1000);
std::chrono::duration<float> inference_diff = inference_end - inference_start;
time_info[1] += double(inference_diff.count() * 1000);
std::chrono::duration<float> postprocess_diff = postprocess_end - postprocess_start;
time_info[2] += double(postprocess_diff.count() * 1000);
}
times->push_back(time_info[0]);
times->push_back(time_info[1]);
times->push_back(time_info[2]);
}
std::vector<std::vector<std::vector<int>>>
@ -312,7 +332,6 @@ std::shared_ptr<PaddlePredictor> loadModel(std::string model_file, int num_threa
config.set_model_from_file(model_file);
config.set_threads(num_threads);
std::shared_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<MobileConfig>(config);
return predictor;
@ -434,6 +453,9 @@ void system(char **argv){
auto rec_predictor = loadModel(rec_model_file, std::stoi(num_threads));
auto cls_predictor = loadModel(cls_model_file, std::stoi(num_threads));
std::vector<double> det_time_info = {0, 0, 0};
std::vector<double> rec_time_info = {0, 0, 0};
for (int i = 0; i < cv_all_img_names.size(); ++i) {
std::cout << "The predict img: " << cv_all_img_names[i] << std::endl;
cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR);
@ -459,8 +481,38 @@ void system(char **argv){
//// print recognized text
for (int i = 0; i < rec_text.size(); i++) {
std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i]
<< std::endl;
<< std::endl;
}
det_time_info[0] += det_times[0];
det_time_info[1] += det_times[1];
det_time_info[2] += det_times[2];
rec_time_info[0] += rec_times[0];
rec_time_info[1] += rec_times[1];
rec_time_info[2] += rec_times[2];
}
if (strcmp(argv[12], "True") == 0) {
AutoLogger autolog_det(det_model_file,
runtime_device,
std::stoi(num_threads),
std::stoi(batchsize),
"dynamic",
precision,
det_time_info,
cv_all_img_names.size());
AutoLogger autolog_rec(rec_model_file,
runtime_device,
std::stoi(num_threads),
std::stoi(batchsize),
"dynamic",
precision,
rec_time_info,
cv_all_img_names.size());
autolog_det.report();
std::cout << std::endl;
autolog_rec.report();
}
}
@ -503,15 +555,15 @@ void det(int argc, char **argv) {
auto img_vis = Visualization(srcimg, boxes);
std::cout << boxes.size() << " bboxes have detected:" << std::endl;
// for (int i=0; i<boxes.size(); i++){
// std::cout << "The " << i << " box:" << std::endl;
// for (int j=0; j<4; j++){
// for (int k=0; k<2; k++){
// std::cout << boxes[i][j][k] << "\t";
// }
// }
// std::cout << std::endl;
// }
for (int i=0; i<boxes.size(); i++){
std::cout << "The " << i << " box:" << std::endl;
for (int j=0; j<4; j++){
for (int k=0; k<2; k++){
std::cout << boxes[i][j][k] << "\t";
}
}
std::cout << std::endl;
}
time_info[0] += times[0];
time_info[1] += times[1];
time_info[2] += times[2];
@ -585,6 +637,9 @@ void rec(int argc, char **argv) {
std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i]
<< std::endl;
}
time_info[0] += times[0];
time_info[1] += times[1];
time_info[2] += times[2];
}
// TODO: support autolog
if (strcmp(argv[9], "True") == 0) {

View File

@ -52,12 +52,17 @@ def main(config, device, logger, vdl_writer):
config['Architecture']["Head"]['out_channels'] = char_num
model = build_model(config['Architecture'])
flops = paddle.flops(model, [1, 3, 640, 640])
logger.info(f"FLOPs before pruning: {flops}")
if config['Architecture']['model_type'] == 'det':
input_shape = [1, 3, 640, 640]
elif config['Architecture']['model_type'] == 'rec':
input_shape = [1, 3, 32, 320]
flops = paddle.flops(model, input_shape)
logger.info("FLOPs before pruning: {}".format(flops))
from paddleslim.dygraph import FPGMFilterPruner
model.train()
pruner = FPGMFilterPruner(model, [1, 3, 640, 640])
pruner = FPGMFilterPruner(model, input_shape)
# build metric
eval_class = build_metric(config['Metric'])
@ -65,8 +70,13 @@ def main(config, device, logger, vdl_writer):
def eval_fn():
metric = program.eval(model, valid_dataloader, post_process_class,
eval_class)
logger.info(f"metric['hmean']: {metric['hmean']}")
return metric['hmean']
if config['Architecture']['model_type'] == 'det':
main_indicator = 'hmean'
else:
main_indicator = 'acc'
logger.info("metric[{}]: {}".format(main_indicator, metric[
main_indicator]))
return metric[main_indicator]
params_sensitive = pruner.sensitive(
eval_func=eval_fn,
@ -81,18 +91,22 @@ def main(config, device, logger, vdl_writer):
# calculate pruned params's ratio
params_sensitive = pruner._get_ratios_by_loss(params_sensitive, loss=0.02)
for key in params_sensitive.keys():
logger.info(f"{key}, {params_sensitive[key]}")
logger.info("{}, {}".format(key, params_sensitive[key]))
plan = pruner.prune_vars(params_sensitive, [0])
flops = paddle.flops(model, [1, 3, 640, 640])
logger.info(f"FLOPs after pruning: {flops}")
flops = paddle.flops(model, input_shape)
logger.info("FLOPs after pruning: {}".format(flops))
# load pretrain model
load_model(config, model)
metric = program.eval(model, valid_dataloader, post_process_class,
eval_class)
logger.info(f"metric['hmean']: {metric['hmean']}")
if config['Architecture']['model_type'] == 'det':
main_indicator = 'hmean'
else:
main_indicator = 'acc'
logger.info("metric['']: {}".format(main_indicator, metric[main_indicator]))
# start export model
from paddle.jit import to_static

View File

@ -73,13 +73,18 @@ def main(config, device, logger, vdl_writer):
char_num = len(getattr(post_process_class, 'character'))
config['Architecture']["Head"]['out_channels'] = char_num
model = build_model(config['Architecture'])
if config['Architecture']['model_type'] == 'det':
input_shape = [1, 3, 640, 640]
elif config['Architecture']['model_type'] == 'rec':
input_shape = [1, 3, 32, 320]
flops = paddle.flops(model, input_shape)
flops = paddle.flops(model, [1, 3, 640, 640])
logger.info("FLOPs before pruning: {}".format(flops))
from paddleslim.dygraph import FPGMFilterPruner
model.train()
pruner = FPGMFilterPruner(model, [1, 3, 640, 640])
pruner = FPGMFilterPruner(model, input_shape)
# build loss
loss_class = build_loss(config['Loss'])
@ -107,8 +112,14 @@ def main(config, device, logger, vdl_writer):
def eval_fn():
metric = program.eval(model, valid_dataloader, post_process_class,
eval_class, False)
logger.info("metric['hmean']: {}".format(metric['hmean']))
return metric['hmean']
if config['Architecture']['model_type'] == 'det':
main_indicator = 'hmean'
else:
main_indicator = 'acc'
logger.info("metric[{}]: {}".format(main_indicator, metric[
main_indicator]))
return metric[main_indicator]
run_sensitive_analysis = False
"""
@ -149,7 +160,7 @@ def main(config, device, logger, vdl_writer):
plan = pruner.prune_vars(params_sensitive, [0])
flops = paddle.flops(model, [1, 3, 640, 640])
flops = paddle.flops(model, input_shape)
logger.info("FLOPs after pruning: {}".format(flops))
# start train

View File

@ -247,3 +247,7 @@ Q1: 训练模型转inference 模型之后预测效果不一致?
**A**此类问题出现较多问题多是trained model预测时候的预处理、后处理参数和inference model预测的时候的预处理、后处理参数不一致导致的。以det_mv3_db.yml配置文件训练的模型为例训练模型、inference模型预测结果不一致问题解决方式如下
- 检查[trained model预处理](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L116),和[inference model的预测预处理](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/predict_det.py#L42)函数是否一致。算法在评估的时候输入图像大小会影响精度为了和论文保持一致训练icdar15配置文件中将图像resize到[736, 1280]但是在inference model预测的时候只有一套默认参数会考虑到预测速度问题默认限制图像最长边为960做resize的。训练模型预处理和inference模型的预处理函数位于[ppocr/data/imaug/operators.py](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/ppocr/data/imaug/operators.py#L147)
- 检查[trained model后处理](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L51),和[inference 后处理参数](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/utility.py#L50)是否一致。
Q1: 训练EAST模型提示找不到lanms库
**A**执行pip3 install lanms-nova 即可。

View File

@ -34,6 +34,8 @@ inference 模型(`paddle.jit.save`保存的模型)
- [1. 超轻量中文OCR模型推理](#超轻量中文OCR模型推理)
- [2. 其他模型推理](#其他模型推理)
- [六、参数解释](#参数解释)
<a name="训练模型转inference模型"></a>
## 一、训练模型转inference模型
@ -394,3 +396,127 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_10.jpg" --d
执行命令后,识别结果图像如下:
![](../imgs_results/img_10_east_starnet.jpg)
<a name="参数解释"></a>
# 六、参数解释
更多关于预测过程的参数解释如下所示。
* 全局信息
| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
| image_dir | str | 无,必须显式指定 | 图像或者文件夹路径 |
| vis_font_path | str | "./doc/fonts/simfang.ttf" | 用于可视化的字体路径 |
| drop_score | float | 0.5 | 识别得分小于该值的结果会被丢弃,不会作为返回结果 |
| use_pdserving | bool | False | 是否使用Paddle Serving进行预测 |
| warmup | bool | False | 是否开启warmup在统计预测耗时的时候可以使用这种方法 |
| draw_img_save_dir | str | "./inference_results" | 系统串联预测OCR结果的保存文件夹 |
| save_crop_res | bool | False | 是否保存OCR的识别文本图像 |
| crop_res_save_dir | str | "./output" | 保存OCR识别出来的文本图像路径 |
| use_mp | bool | False | 是否开启多进程预测 |
| total_process_num | int | 6 | 开启的进城数,`use_mp`为`True`时生效 |
| process_id | int | 0 | 当前进程的id号无需自己修改 |
| benchmark | bool | False | 是否开启benchmark对预测速度、显存占用等进行统计 |
| save_log_path | str | "./log_output/" | 开启`benchmark`时,日志结果的保存文件夹 |
| show_log | bool | True | 是否显示预测中的日志信息 |
| use_onnx | bool | False | 是否开启onnx预测 |
* 预测引擎相关
| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
| use_gpu | bool | True | 是否使用GPU进行预测 |
| ir_optim | bool | True | 是否对计算图进行分析与优化,开启后可以加速预测过程 |
| use_tensorrt | bool | False | 是否开启tensorrt |
| min_subgraph_size | int | 15 | tensorrt中最小子图size当子图的size大于该值时才会尝试对该子图使用trt engine计算 |
| precision | str | fp32 | 预测的精度,支持`fp32`, `fp16`, `int8` 3种输入 |
| enable_mkldnn | bool | True | 是否开启mkldnn |
| cpu_threads | int | 10 | 开启mkldnn时cpu预测的线程数 |
* 文本检测模型相关
| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
| det_algorithm | str | "DB" | 文本检测算法名称,目前支持`DB`, `EAST`, `SAST`, `PSE` |
| det_model_dir | str | xx | 检测inference模型路径 |
| det_limit_side_len | int | 960 | 检测的图像边长限制 |
| det_limit_type | str | "max" | 检测的变成限制类型,目前支持`min`, `max``min`表示保证图像最短边不小于`det_limit_side_len``max`表示保证图像最长边不大于`det_limit_side_len` |
其中DB算法相关参数如下
| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
| det_db_thresh | float | 0.3 | DB输出的概率图中得分大于该阈值的像素点才会被认为是文字像素点 |
| det_db_box_thresh | float | 0.6 | 检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域 |
| det_db_unclip_ratio | float | 1.5 | `Vatti clipping`算法的扩张系数,使用该方法对文字区域进行扩张 |
| max_batch_size | int | 10 | 预测的batch size |
| use_dilation | bool | False | 是否对分割结果进行膨胀以获取更优检测效果 |
| det_db_score_mode | str | "fast" | DB的检测结果得分计算方法支持`fast`和`slow``fast`是根据polygon的外接矩形边框内的所有像素计算平均得分`slow`是根据原始polygon内的所有像素计算平均得分计算速度相对较慢一些但是更加准确一些。 |
EAST算法相关参数如下
| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
| det_east_score_thresh | float | 0.8 | EAST后处理中score map的阈值 |
| det_east_cover_thresh | float | 0.1 | EAST后处理中文本框的平均得分阈值 |
| det_east_nms_thresh | float | 0.2 | EAST后处理中nms的阈值 |
SAST算法相关参数如下
| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
| det_sast_score_thresh | float | 0.5 | SAST后处理中的得分阈值 |
| det_sast_nms_thresh | float | 0.5 | SAST后处理中nms的阈值 |
| det_sast_polygon | bool | False | 是否多边形检测弯曲文本场景如Total-Text设置为True |
PSE算法相关参数如下
| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
| det_pse_thresh | float | 0.0 | 对输出图做二值化的阈值 |
| det_pse_box_thresh | float | 0.85 | 对box进行过滤的阈值低于此阈值的丢弃 |
| det_pse_min_area | float | 16 | box的最小面积低于此阈值的丢弃 |
| det_pse_box_type | str | "box" | 返回框的类型box:四点坐标poly: 弯曲文本的所有点坐标 |
| det_pse_scale | int | 1 | 输入图像相对于进后处理的图的比例,如`640*640`的图像,网络输出为`160*160`scale为2的情况下进后处理的图片shape为`320*320`。这个值调大可以加快后处理速度,但是会带来精度的下降 |
* 文本识别模型相关
| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
| rec_algorithm | str | "CRNN" | 文本识别算法名称,目前支持`CRNN`, `SRN`, `RARE`, `NETR`, `SAR` |
| rec_model_dir | str | 无,如果使用识别模型,该项是必填项 | 识别inference模型路径 |
| rec_image_shape | list | [3, 32, 320] | 识别时的图像尺寸, |
| rec_batch_num | int | 6 | 识别的batch size |
| max_text_length | int | 25 | 识别结果最大长度,在`SRN`中有效 |
| rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | 识别的字符字典文件 |
| use_space_char | bool | True | 是否包含空格,如果为`True`,则会在最后字符字典中补充`空格`字符 |
* 端到端文本检测与识别模型相关
| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
| e2e_algorithm | str | "PGNet" | 端到端算法名称,目前支持`PGNet` |
| e2e_model_dir | str | 无,如果使用端到端模型,该项是必填项 | 端到端模型inference模型路径 |
| e2e_limit_side_len | int | 768 | 端到端的输入图像边长限制 |
| e2e_limit_type | str | "max" | 端到端的边长限制类型,目前支持`min`, `max``min`表示保证图像最短边不小于`e2e_limit_side_len``max`表示保证图像最长边不大于`e2e_limit_side_len` |
| e2e_pgnet_score_thresh | float | 0.5 | 端到端得分阈值,小于该阈值的结果会被丢弃 |
| e2e_char_dict_path | str | "./ppocr/utils/ic15_dict.txt" | 识别的字典文件路径 |
| e2e_pgnet_valid_set | str | "totaltext" | 验证集名称,目前支持`totaltext`, `partvgg`,不同数据集对应的后处理方式不同,与训练过程保持一致即可 |
| e2e_pgnet_mode | str | "fast" | PGNet的检测结果得分计算方法支持`fast`和`slow``fast`是根据polygon的外接矩形边框内的所有像素计算平均得分`slow`是根据原始polygon内的所有像素计算平均得分计算速度相对较慢一些但是更加准确一些。 |
* 方向分类器模型相关
| 参数名称 | 类型 | 默认值 | 含义 |
| :--: | :--: | :--: | :--: |
| use_angle_cls | bool | False | 是否使用方向分类器 |
| cls_model_dir | str | 无,如果需要使用,则必须显式指定路径 | 方向分类器inference模型路径 |
| cls_image_shape | list | [3, 48, 192] | 预测尺度 |
| label_list | list | ['0', '180'] | class id对应的角度值 |
| cls_batch_num | int | 6 | 方向分类器预测的batch size |
| cls_thresh | float | 0.9 | 预测阈值模型预测结果为180度且得分大于该阈值时认为最终预测结果为180度需要翻转 |

View File

@ -33,8 +33,8 @@ PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训
|模型名称|模型简介|配置文件|推理模型大小|下载地址|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv2_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)| 3M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det|【最新】原始超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
|ch_PP-OCRv2_det_slim|【最新】slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det|【最新】原始超轻量模型,支持中英文、多语种文本检测|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
|ch_ppocr_mobile_slim_v2.0_det|slim裁剪版超轻量模型支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)| 2.6M |[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)|
|ch_ppocr_mobile_v2.0_det|原始超轻量模型,支持中英文、多语种文本检测|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|
|ch_ppocr_server_v2.0_det|通用模型,支持中英文、多语种文本检测,比超轻量模型更大,但效果更好|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)|

View File

@ -66,13 +66,13 @@ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/e2e_server_pgnetA_infer.
### 单张图像或者图像集合预测
```bash
# 预测image_dir指定的单张图像
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext"
# 预测image_dir指定的图像集合
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext"
# 如果想使用CPU进行预测需设置use_gpu参数为False
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True --use_gpu=False
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext" --use_gpu=False
```
### 可视化结果
可视化文本检测结果默认保存到./inference_results文件夹里面结果文件的名称前缀为'e2e_res'。结果示例如下:
@ -167,9 +167,9 @@ python3 tools/infer_e2e.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.infer_img=
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar && tar xf en_server_pgnetA.tar
python3 tools/export_model.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.pretrained_model=./en_server_pgnetA/best_accuracy Global.load_static_weights=False Global.save_inference_dir=./inference/e2e
```
**PGNet端到端模型推理需要设置参数`--e2e_algorithm="PGNet"`**,可以执行如下命令:
**PGNet端到端模型推理需要设置参数`--e2e_algorithm="PGNet"` and `--e2e_pgnet_valid_set="partvgg"`**,可以执行如下命令:
```
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img_10.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_polygon=False
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img_10.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="partvgg" --e2e_pgnet_valid_set="totaltext"
```
可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'e2e_res'。结果示例如下:
@ -178,9 +178,9 @@ python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/im
#### (2). 弯曲文本检测模型Total-Text
对于弯曲文本样例
**PGNet端到端模型推理需要设置参数`--e2e_algorithm="PGNet"`,同时,还需要增加参数`--e2e_pgnet_polygon=True`**可以执行如下命令:
**PGNet端到端模型推理需要设置参数`--e2e_algorithm="PGNet"`,同时,还需要增加参数`--e2e_pgnet_valid_set="totaltext"`**可以执行如下命令:
```
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="totaltext"
```
可视化文本端到端结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'e2e_res'。结果示例如下:

View File

@ -29,8 +29,8 @@ Relationship of the above models is as follows.
|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv2_det_slim|[New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det|[New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
|ch_PP-OCRv2_det_slim|[New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det|[New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
|ch_ppocr_mobile_slim_v2.0_det|Slim pruned lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|2.6M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)|
|ch_ppocr_mobile_v2.0_det|Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|
|ch_ppocr_server_v2.0_det|General model, which is larger than the lightweight model, but achieved better performance|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)|

View File

@ -59,13 +59,13 @@ After decompression, there should be the following file structure:
### Single image or image set prediction
```bash
# Prediction single image specified by image_dir
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext"
# Prediction the collection of images specified by image_dir
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext"
# If you want to use CPU for prediction, you need to set use_gpu parameter is false
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True --use_gpu=False
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --use_gpu=False --e2e_pgnet_valid_set="totaltext"
```
### Visualization results
The visualized end-to-end results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'e2e_res'. Examples of results are as follows:
@ -166,9 +166,9 @@ First, convert the model saved in the PGNet end-to-end training process into an
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar && tar xf en_server_pgnetA.tar
python3 tools/export_model.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.pretrained_model=./en_server_pgnetA/best_accuracy Global.load_static_weights=False Global.save_inference_dir=./inference/e2e
```
**For PGNet quadrangle end-to-end model inference, you need to set the parameter `--e2e_algorithm="PGNet"`**, run the following command:
**For PGNet quadrangle end-to-end model inference, you need to set the parameter `--e2e_algorithm="PGNet"` and `--e2e_pgnet_valid_set="partvgg"`**, run the following command:
```
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img_10.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_polygon=False
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img_10.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="partvgg"
```
The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'e2e_res'. Examples of results are as follows:
@ -176,9 +176,9 @@ The visualized text detection results are saved to the `./inference_results` fol
#### (2). Curved text detection model (Total-Text)
For the curved text example, we use the same model as the quadrilateral
**For PGNet end-to-end curved text detection model inference, you need to set the parameter `--e2e_algorithm="PGNet"` and `--e2e_pgnet_polygon=True`**, run the following command:
**For PGNet end-to-end curved text detection model inference, you need to set the parameter `--e2e_algorithm="PGNet"` and `--e2e_pgnet_valid_set="totaltext"`**, run the following command:
```
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="totaltext"
```
The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'e2e_res'. Examples of results are as follows:

View File

@ -42,7 +42,7 @@ __all__ = [
]
SUPPORT_DET_MODEL = ['DB']
VERSION = '2.3.0.1'
VERSION = '2.3.0.2'
SUPPORT_REC_MODEL = ['CRNN']
BASE_DIR = os.path.expanduser("~/.paddleocr/")

View File

@ -30,21 +30,17 @@ class CenterLoss(nn.Layer):
Reference: Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016.
"""
def __init__(self,
num_classes=6625,
feat_dim=96,
init_center=False,
center_file_path=None):
def __init__(self, num_classes=6625, feat_dim=96, center_file_path=None):
super().__init__()
self.num_classes = num_classes
self.feat_dim = feat_dim
self.centers = paddle.randn(
shape=[self.num_classes, self.feat_dim]).astype("float64")
if init_center:
if center_file_path is not None:
assert os.path.exists(
center_file_path
), f"center path({center_file_path}) must exist when init_center is set as True."
), f"center path({center_file_path}) must exist when it is not None."
with open(center_file_path, 'rb') as f:
char_dict = pickle.load(f)
for key in char_dict.keys():

View File

@ -16,7 +16,7 @@ __all__ = ["build_backbone"]
def build_backbone(config, model_type):
if model_type == "det":
if model_type == "det" or model_type == "table":
from .det_mobilenet_v3 import MobileNetV3
from .det_resnet_vd import ResNet
from .det_resnet_vd_sast import ResNet_SAST
@ -36,10 +36,6 @@ def build_backbone(config, model_type):
elif model_type == "e2e":
from .e2e_resnet_vd_pg import ResNet
support_dict = ["ResNet"]
elif model_type == "table":
from .table_resnet_vd import ResNet
from .table_mobilenet_v3 import MobileNetV3
support_dict = ["ResNet", "MobileNetV3"]
else:
raise NotImplementedError

View File

@ -26,8 +26,10 @@ class MobileNetV3(nn.Layer):
scale=0.5,
large_stride=None,
small_stride=None,
disable_se=False,
**kwargs):
super(MobileNetV3, self).__init__()
self.disable_se = disable_se
if small_stride is None:
small_stride = [2, 2, 2, 2]
if large_stride is None:
@ -101,6 +103,7 @@ class MobileNetV3(nn.Layer):
block_list = []
inplanes = make_divisible(inplanes * scale)
for (k, exp, c, se, nl, s) in cfg:
se = se and not self.disable_se
block_list.append(
ResidualUnit(
in_channels=inplanes,

View File

@ -1,287 +0,0 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
__all__ = ['MobileNetV3']
def make_divisible(v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class MobileNetV3(nn.Layer):
def __init__(self,
in_channels=3,
model_name='large',
scale=0.5,
disable_se=False,
**kwargs):
"""
the MobilenetV3 backbone network for detection module.
Args:
params(dict): the super parameters for build network
"""
super(MobileNetV3, self).__init__()
self.disable_se = disable_se
if model_name == "large":
cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, False, 'relu', 1],
[3, 64, 24, False, 'relu', 2],
[3, 72, 24, False, 'relu', 1],
[5, 72, 40, True, 'relu', 2],
[5, 120, 40, True, 'relu', 1],
[5, 120, 40, True, 'relu', 1],
[3, 240, 80, False, 'hardswish', 2],
[3, 200, 80, False, 'hardswish', 1],
[3, 184, 80, False, 'hardswish', 1],
[3, 184, 80, False, 'hardswish', 1],
[3, 480, 112, True, 'hardswish', 1],
[3, 672, 112, True, 'hardswish', 1],
[5, 672, 160, True, 'hardswish', 2],
[5, 960, 160, True, 'hardswish', 1],
[5, 960, 160, True, 'hardswish', 1],
]
cls_ch_squeeze = 960
elif model_name == "small":
cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, True, 'relu', 2],
[3, 72, 24, False, 'relu', 2],
[3, 88, 24, False, 'relu', 1],
[5, 96, 40, True, 'hardswish', 2],
[5, 240, 40, True, 'hardswish', 1],
[5, 240, 40, True, 'hardswish', 1],
[5, 120, 48, True, 'hardswish', 1],
[5, 144, 48, True, 'hardswish', 1],
[5, 288, 96, True, 'hardswish', 2],
[5, 576, 96, True, 'hardswish', 1],
[5, 576, 96, True, 'hardswish', 1],
]
cls_ch_squeeze = 576
else:
raise NotImplementedError("mode[" + model_name +
"_model] is not implemented!")
supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
assert scale in supported_scale, \
"supported scale are {} but input scale is {}".format(supported_scale, scale)
inplanes = 16
# conv1
self.conv = ConvBNLayer(
in_channels=in_channels,
out_channels=make_divisible(inplanes * scale),
kernel_size=3,
stride=2,
padding=1,
groups=1,
if_act=True,
act='hardswish',
name='conv1')
self.stages = []
self.out_channels = []
block_list = []
i = 0
inplanes = make_divisible(inplanes * scale)
for (k, exp, c, se, nl, s) in cfg:
se = se and not self.disable_se
start_idx = 2 if model_name == 'large' else 0
if s == 2 and i > start_idx:
self.out_channels.append(inplanes)
self.stages.append(nn.Sequential(*block_list))
block_list = []
block_list.append(
ResidualUnit(
in_channels=inplanes,
mid_channels=make_divisible(scale * exp),
out_channels=make_divisible(scale * c),
kernel_size=k,
stride=s,
use_se=se,
act=nl,
name="conv" + str(i + 2)))
inplanes = make_divisible(scale * c)
i += 1
block_list.append(
ConvBNLayer(
in_channels=inplanes,
out_channels=make_divisible(scale * cls_ch_squeeze),
kernel_size=1,
stride=1,
padding=0,
groups=1,
if_act=True,
act='hardswish',
name='conv_last'))
self.stages.append(nn.Sequential(*block_list))
self.out_channels.append(make_divisible(scale * cls_ch_squeeze))
for i, stage in enumerate(self.stages):
self.add_sublayer(sublayer=stage, name="stage{}".format(i))
def forward(self, x):
x = self.conv(x)
out_list = []
for stage in self.stages:
x = stage(x)
out_list.append(x)
return out_list
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups=1,
if_act=True,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.if_act = if_act
self.act = act
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
weight_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=None,
param_attr=ParamAttr(name=name + "_bn_scale"),
bias_attr=ParamAttr(name=name + "_bn_offset"),
moving_mean_name=name + "_bn_mean",
moving_variance_name=name + "_bn_variance")
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.if_act:
if self.act == "relu":
x = F.relu(x)
elif self.act == "hardswish":
x = F.hardswish(x)
else:
print("The activation function({}) is selected incorrectly.".
format(self.act))
exit()
return x
class ResidualUnit(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
kernel_size,
stride,
use_se,
act=None,
name=''):
super(ResidualUnit, self).__init__()
self.if_shortcut = stride == 1 and in_channels == out_channels
self.if_se = use_se
self.expand_conv = ConvBNLayer(
in_channels=in_channels,
out_channels=mid_channels,
kernel_size=1,
stride=1,
padding=0,
if_act=True,
act=act,
name=name + "_expand")
self.bottleneck_conv = ConvBNLayer(
in_channels=mid_channels,
out_channels=mid_channels,
kernel_size=kernel_size,
stride=stride,
padding=int((kernel_size - 1) // 2),
groups=mid_channels,
if_act=True,
act=act,
name=name + "_depthwise")
if self.if_se:
self.mid_se = SEModule(mid_channels, name=name + "_se")
self.linear_conv = ConvBNLayer(
in_channels=mid_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
padding=0,
if_act=False,
act=None,
name=name + "_linear")
def forward(self, inputs):
x = self.expand_conv(inputs)
x = self.bottleneck_conv(x)
if self.if_se:
x = self.mid_se(x)
x = self.linear_conv(x)
if self.if_shortcut:
x = paddle.add(inputs, x)
return x
class SEModule(nn.Layer):
def __init__(self, in_channels, reduction=4, name=""):
super(SEModule, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2D(1)
self.conv1 = nn.Conv2D(
in_channels=in_channels,
out_channels=in_channels // reduction,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(name=name + "_1_weights"),
bias_attr=ParamAttr(name=name + "_1_offset"))
self.conv2 = nn.Conv2D(
in_channels=in_channels // reduction,
out_channels=in_channels,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(name + "_2_weights"),
bias_attr=ParamAttr(name=name + "_2_offset"))
def forward(self, inputs):
outputs = self.avg_pool(inputs)
outputs = self.conv1(outputs)
outputs = F.relu(outputs)
outputs = self.conv2(outputs)
outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)
return inputs * outputs

View File

@ -1,280 +0,0 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F
__all__ = ["ResNet"]
class ConvBNLayer(nn.Layer):
def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
groups=1,
is_vd_mode=False,
act=None,
name=None, ):
super(ConvBNLayer, self).__init__()
self.is_vd_mode = is_vd_mode
self._pool2d_avg = nn.AvgPool2D(
kernel_size=2, stride=2, padding=0, ceil_mode=True)
self._conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=(kernel_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
self._batch_norm = nn.BatchNorm(
out_channels,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def forward(self, inputs):
if self.is_vd_mode:
inputs = self._pool2d_avg(inputs)
y = self._conv(inputs)
y = self._batch_norm(y)
return y
class BottleneckBlock(nn.Layer):
def __init__(self,
in_channels,
out_channels,
stride,
shortcut=True,
if_first=False,
name=None):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
act='relu',
name=name + "_branch2a")
self.conv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
self.conv2 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels * 4,
kernel_size=1,
act=None,
name=name + "_branch2c")
if not shortcut:
self.short = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels * 4,
kernel_size=1,
stride=1,
is_vd_mode=False if if_first else True,
name=name + "_branch1")
self.shortcut = shortcut
def forward(self, inputs):
y = self.conv0(inputs)
conv1 = self.conv1(y)
conv2 = self.conv2(conv1)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
y = paddle.add(x=short, y=conv2)
y = F.relu(y)
return y
class BasicBlock(nn.Layer):
def __init__(self,
in_channels,
out_channels,
stride,
shortcut=True,
if_first=False,
name=None):
super(BasicBlock, self).__init__()
self.stride = stride
self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
act='relu',
name=name + "_branch2a")
self.conv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
act=None,
name=name + "_branch2b")
if not shortcut:
self.short = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
is_vd_mode=False if if_first else True,
name=name + "_branch1")
self.shortcut = shortcut
def forward(self, inputs):
y = self.conv0(inputs)
conv1 = self.conv1(y)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
y = paddle.add(x=short, y=conv1)
y = F.relu(y)
return y
class ResNet(nn.Layer):
def __init__(self, in_channels=3, layers=50, **kwargs):
super(ResNet, self).__init__()
self.layers = layers
supported_layers = [18, 34, 50, 101, 152, 200]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(
supported_layers, layers)
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
elif layers == 200:
depth = [3, 12, 48, 3]
num_channels = [64, 256, 512,
1024] if layers >= 50 else [64, 64, 128, 256]
num_filters = [64, 128, 256, 512]
self.conv1_1 = ConvBNLayer(
in_channels=in_channels,
out_channels=32,
kernel_size=3,
stride=2,
act='relu',
name="conv1_1")
self.conv1_2 = ConvBNLayer(
in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
act='relu',
name="conv1_2")
self.conv1_3 = ConvBNLayer(
in_channels=32,
out_channels=64,
kernel_size=3,
stride=1,
act='relu',
name="conv1_3")
self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
self.stages = []
self.out_channels = []
if layers >= 50:
for block in range(len(depth)):
block_list = []
shortcut = False
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
bottleneck_block = self.add_sublayer(
'bb_%d_%d' % (block, i),
BottleneckBlock(
in_channels=num_channels[block]
if i == 0 else num_filters[block] * 4,
out_channels=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
shortcut=shortcut,
if_first=block == i == 0,
name=conv_name))
shortcut = True
block_list.append(bottleneck_block)
self.out_channels.append(num_filters[block] * 4)
self.stages.append(nn.Sequential(*block_list))
else:
for block in range(len(depth)):
block_list = []
shortcut = False
for i in range(depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
basic_block = self.add_sublayer(
'bb_%d_%d' % (block, i),
BasicBlock(
in_channels=num_channels[block]
if i == 0 else num_filters[block],
out_channels=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
shortcut=shortcut,
if_first=block == i == 0,
name=conv_name))
shortcut = True
block_list.append(basic_block)
self.out_channels.append(num_filters[block])
self.stages.append(nn.Sequential(*block_list))
def forward(self, inputs):
y = self.conv1_1(inputs)
y = self.conv1_2(y)
y = self.conv1_3(y)
y = self.pool2d_max(y)
out = []
for block in self.stages:
y = block(y)
out.append(y)
return out

View File

@ -53,7 +53,6 @@ class AttentionHead(nn.Layer):
output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
output = paddle.concat(output_hiddens, axis=1)
probs = self.generator(output)
else:
targets = paddle.zeros(shape=[batch_size], dtype="int32")
probs = None
@ -75,6 +74,7 @@ class AttentionHead(nn.Layer):
probs_step, axis=1)], axis=1)
next_input = probs_step.argmax(axis=1)
targets = next_input
if not self.training:
probs = paddle.nn.functional.softmax(probs, axis=2)
return probs

View File

@ -53,7 +53,7 @@ def compute_partial_repr(input_points, control_points):
1]
repr_matrix = 0.5 * pairwise_dist * paddle.log(pairwise_dist)
# fix numerical error for 0 * log(0), substitute all nan with 0
mask = repr_matrix != repr_matrix
mask = np.array(repr_matrix != repr_matrix)
repr_matrix[mask] = 0
return repr_matrix

View File

@ -29,6 +29,7 @@ class EASTPostProcess(object):
"""
The post process for EAST.
"""
def __init__(self,
score_thresh=0.8,
cover_thresh=0.1,
@ -38,11 +39,6 @@ class EASTPostProcess(object):
self.score_thresh = score_thresh
self.cover_thresh = cover_thresh
self.nms_thresh = nms_thresh
# c++ la-nms is faster, but only support python 3.5
self.is_python35 = False
if sys.version_info.major == 3 and sys.version_info.minor == 5:
self.is_python35 = True
def restore_rectangle_quad(self, origin, geometry):
"""
@ -64,6 +60,7 @@ class EASTPostProcess(object):
"""
restore text boxes from score map and geo map
"""
score_map = score_map[0]
geo_map = np.swapaxes(geo_map, 1, 0)
geo_map = np.swapaxes(geo_map, 1, 2)
@ -79,10 +76,14 @@ class EASTPostProcess(object):
boxes = np.zeros((text_box_restored.shape[0], 9), dtype=np.float32)
boxes[:, :8] = text_box_restored.reshape((-1, 8))
boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]]
if self.is_python35:
try:
import lanms
boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh)
else:
except:
print(
'you should install lanms by pip3 install lanms-nova to speed up nms_locality'
)
boxes = nms_locality(boxes.astype(np.float64), nms_thresh)
if boxes.shape[0] == 0:
return []
@ -139,4 +140,4 @@ class EASTPostProcess(object):
continue
boxes_norm.append(box)
dt_boxes_list.append({'points': np.array(boxes_norm)})
return dt_boxes_list
return dt_boxes_list

View File

@ -54,14 +54,37 @@ def load_model(config, model, optimizer=None):
pretrained_model = global_config.get('pretrained_model')
best_model_dict = {}
if checkpoints:
if checkpoints.endswith('pdparams'):
if checkpoints.endswith('.pdparams'):
checkpoints = checkpoints.replace('.pdparams', '')
assert os.path.exists(checkpoints + ".pdopt"), \
f"The {checkpoints}.pdopt does not exists!"
load_pretrained_params(model, checkpoints)
optim_dict = paddle.load(checkpoints + '.pdopt')
assert os.path.exists(checkpoints + ".pdparams"), \
"The {}.pdparams does not exists!".format(checkpoints)
# load params from trained model
params = paddle.load(checkpoints + '.pdparams')
state_dict = model.state_dict()
new_state_dict = {}
for key, value in state_dict.items():
if key not in params:
logger.warning("{} not in loaded params {} !".format(
key, params.keys()))
continue
pre_value = params[key]
if list(value.shape) == list(pre_value.shape):
new_state_dict[key] = pre_value
else:
logger.warning(
"The shape of model params {} {} not matched with loaded params shape {} !".
format(key, value.shape, pre_value.shape))
model.set_state_dict(new_state_dict)
if optimizer is not None:
optimizer.set_state_dict(optim_dict)
if os.path.exists(checkpoints + '.pdopt'):
optim_dict = paddle.load(checkpoints + '.pdopt')
optimizer.set_state_dict(optim_dict)
else:
logger.warning(
"{}.pdopt is not exists, params of optimizer is not loaded".
format(checkpoints))
if os.path.exists(checkpoints + '.states'):
with open(checkpoints + '.states', 'rb') as f:
@ -80,10 +103,10 @@ def load_model(config, model, optimizer=None):
def load_pretrained_params(model, path):
logger = get_logger()
if path.endswith('pdparams'):
if path.endswith('.pdparams'):
path = path.replace('.pdparams', '')
assert os.path.exists(path + ".pdparams"), \
f"The {path}.pdparams does not exists!"
"The {}.pdparams does not exists!".format(path)
params = paddle.load(path + '.pdparams')
state_dict = model.state_dict()
@ -92,11 +115,11 @@ def load_pretrained_params(model, path):
if list(state_dict[k1].shape) == list(params[k2].shape):
new_state_dict[k1] = params[k2]
else:
logger.info(
f"The shape of model params {k1} {state_dict[k1].shape} not matched with loaded params {k2} {params[k2].shape} !"
)
logger.warning(
"The shape of model params {} {} not matched with loaded params {} {} !".
format(k1, state_dict[k1].shape, k2, params[k2].shape))
model.set_state_dict(new_state_dict)
logger.info(f"load pretrain successful from {path}")
logger.info("load pretrain successful from {}".format(path))
return model

View File

@ -0,0 +1,182 @@
# 视觉问答VQA
VQA主要特性如下
- 集成[LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf)模型以及PP-OCR预测引擎。
- 支持基于多模态方法的语义实体识别 (Semantic Entity Recognition, SER) 以及关系抽取 (Relation Extraction, RE) 任务。基于 SER 任务,可以完成对图像中的文本识别与分类;基于 RE 任务,可以完成对图象中的文本内容的关系提取(比如判断问题对)
- 支持SER任务与OCR引擎联合的端到端系统预测与评估。
- 支持SER任务和RE任务的自定义训练
本项目是 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/pdf/2104.08836.pdf) 在 Paddle 2.2上的开源实现,
包含了在 [XFUND数据集](https://github.com/doc-analysis/XFUND) 上的微调代码。
## 1. 效果演示
**注意:** 测试图片来源于XFUN数据集。
### 1.1 SER
<div align="center">
<img src="./images/result_ser/zh_val_0_ser.jpg" width = "600" />
</div>
<div align="center">
<img src="./images/result_ser/zh_val_42_ser.jpg" width = "600" />
</div>
其中不同颜色的框表示不同的类别对于XFUN数据集有`QUESTION`, `ANSWER`, `HEADER` 3种类别在OCR检测框的左上方也标出了对应的类别和OCR识别结果。
### 1.2 RE
* Coming soon!
## 2. 安装
### 2.1 安装依赖
- **1) 安装PaddlePaddle**
```bash
pip3 install --upgrade pip
# GPU安装
python3 -m pip install paddlepaddle-gpu==2.2 -i https://mirror.baidu.com/pypi/simple
# CPU安装
python3 -m pip install paddlepaddle==2.2 -i https://mirror.baidu.com/pypi/simple
```
更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。
### 2.2 安装PaddleOCR包含 PP-OCR 和 VQA
- **1pip快速安装PaddleOCR whl包仅预测**
```bash
pip install "paddleocr>=2.2" # 推荐使用2.2+版本
```
- **2下载VQA源码预测+训练)**
```bash
【推荐】git clone https://github.com/PaddlePaddle/PaddleOCR
# 如果因为网络问题无法pull成功也可选择使用码云上的托管
git clone https://gitee.com/paddlepaddle/PaddleOCR
# 注码云托管代码可能无法实时同步本github项目更新存在3~5天延时请优先使用推荐方式。
```
- **3安装PaddleNLP**
```bash
# 需要使用PaddleNLP最新的代码版本进行安装
git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
cd PaddleNLP
pip install -e .
```
- **4安装VQA的`requirements`**
```bash
pip install -r requirements.txt
```
## 3. 使用
### 3.1 数据和预训练模型准备
处理好的XFUN中文数据集下载地址[https://paddleocr.bj.bcebos.com/dataset/XFUND.tar](https://paddleocr.bj.bcebos.com/dataset/XFUND.tar)。
下载并解压该数据集,解压后将数据集放置在当前目录下。
```shell
wget https://paddleocr.bj.bcebos.com/dataset/XFUND.tar
```
如果希望转换XFUN中其他语言的数据集可以参考[XFUN数据转换脚本](helper/trans_xfun_data.py)。
如果希望直接体验预测过程可以下载我们提供的SER预训练模型跳过训练过程直接预测即可。
* SER任务预训练模型下载链接[链接](https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar)
* RE任务预训练模型下载链接coming soon!
### 3.2 SER任务
* 启动训练
```shell
python train_ser.py \
--model_name_or_path "layoutxlm-base-uncased" \
--train_data_dir "XFUND/zh_train/image" \
--train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
--eval_data_dir "XFUND/zh_val/image" \
--eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
--num_train_epochs 200 \
--eval_steps 10 \
--save_steps 500 \
--output_dir "./output/ser/" \
--learning_rate 5e-5 \
--warmup_steps 50 \
--evaluate_during_training \
--seed 2048
```
最终会打印出`precision`, `recall`, `f1`等指标,如下所示。
```
best metrics: {'loss': 1.066644651549203, 'precision': 0.8770182068017863, 'recall': 0.9361936193619362, 'f1': 0.9056402979780063}
```
模型和训练日志会保存在`./output/ser/`文件夹中。
* 使用评估集合中提供的OCR识别结果进行预测
```shell
export CUDA_VISIBLE_DEVICES=0
python3.7 infer_ser.py \
--model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
--output_dir "output_res/" \
--infer_imgs "XFUND/zh_val/image/" \
--ocr_json_path "XFUND/zh_val/xfun_normalize_val.json"
```
最终会在`output_res`目录下保存预测结果可视化图像以及预测结果文本文件,文件名为`infer_results.txt`。
* 使用`OCR引擎 + SER`串联结果
```shell
export CUDA_VISIBLE_DEVICES=0
python3.7 infer_ser_e2e.py \
--model_name_or_path "./output/PP-Layout_v1.0_ser_pretrained/" \
--max_seq_length 512 \
--output_dir "output_res_e2e/"
```
* 对`OCR引擎 + SER`预测系统进行端到端评估
```shell
export CUDA_VISIBLE_DEVICES=0
python helper/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
```
3.3 RE任务
coming soon!
## 参考链接
- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf
- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm
- XFUND dataset, https://github.com/doc-analysis/XFUND

View File

@ -0,0 +1,262 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import sys
# import Polygon
import shapely
from shapely.geometry import Polygon
import numpy as np
from collections import defaultdict
import operator
import editdistance
import argparse
import json
import copy
def parse_ser_results_fp(fp, fp_type="gt", ignore_background=True):
# img/zh_val_0.jpg {
# "height": 3508,
# "width": 2480,
# "ocr_info": [
# {"text": "Maribyrnong", "label": "other", "bbox": [1958, 144, 2184, 198]},
# {"text": "CITYCOUNCIL", "label": "other", "bbox": [2052, 183, 2171, 214]},
# ]
assert fp_type in ["gt", "pred"]
key = "label" if fp_type == "gt" else "pred"
res_dict = dict()
with open(fp, "r") as fin:
lines = fin.readlines()
for _, line in enumerate(lines):
img_path, info = line.strip().split("\t")
# get key
image_name = os.path.basename(img_path)
res_dict[image_name] = []
# get infos
json_info = json.loads(info)
for single_ocr_info in json_info["ocr_info"]:
label = single_ocr_info[key].upper()
if label in ["O", "OTHERS", "OTHER"]:
label = "O"
if ignore_background and label == "O":
continue
single_ocr_info["label"] = label
res_dict[image_name].append(copy.deepcopy(single_ocr_info))
return res_dict
def polygon_from_str(polygon_points):
"""
Create a shapely polygon object from gt or dt line.
"""
polygon_points = np.array(polygon_points).reshape(4, 2)
polygon = Polygon(polygon_points).convex_hull
return polygon
def polygon_iou(poly1, poly2):
"""
Intersection over union between two shapely polygons.
"""
if not poly1.intersects(
poly2): # this test is fast and can accelerate calculation
iou = 0
else:
try:
inter_area = poly1.intersection(poly2).area
union_area = poly1.area + poly2.area - inter_area
iou = float(inter_area) / union_area
except shapely.geos.TopologicalError:
# except Exception as e:
# print(e)
print('shapely.geos.TopologicalError occured, iou set to 0')
iou = 0
return iou
def ed(args, str1, str2):
if args.ignore_space:
str1 = str1.replace(" ", "")
str2 = str2.replace(" ", "")
if args.ignore_case:
str1 = str1.lower()
str2 = str2.lower()
return editdistance.eval(str1, str2)
def convert_bbox_to_polygon(bbox):
"""
bbox : [x1, y1, x2, y2]
output: [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
"""
xmin, ymin, xmax, ymax = bbox
poly = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]
return poly
def eval_e2e(args):
# gt
gt_results = parse_ser_results_fp(args.gt_json_path, "gt",
args.ignore_background)
# pred
dt_results = parse_ser_results_fp(args.pred_json_path, "pred",
args.ignore_background)
assert set(gt_results.keys()) == set(dt_results.keys())
iou_thresh = args.iou_thres
num_gt_chars = 0
gt_count = 0
dt_count = 0
hit = 0
ed_sum = 0
for img_name in gt_results:
gt_info = gt_results[img_name]
gt_count += len(gt_info)
dt_info = dt_results[img_name]
dt_count += len(dt_info)
dt_match = [False] * len(dt_info)
gt_match = [False] * len(gt_info)
all_ious = defaultdict(tuple)
# gt: {text, label, bbox or poly}
for index_gt, gt in enumerate(gt_info):
if "poly" not in gt:
gt["poly"] = convert_bbox_to_polygon(gt["bbox"])
gt_poly = polygon_from_str(gt["poly"])
for index_dt, dt in enumerate(dt_info):
if "poly" not in dt:
dt["poly"] = convert_bbox_to_polygon(dt["bbox"])
dt_poly = polygon_from_str(dt["poly"])
iou = polygon_iou(dt_poly, gt_poly)
if iou >= iou_thresh:
all_ious[(index_gt, index_dt)] = iou
sorted_ious = sorted(
all_ious.items(), key=operator.itemgetter(1), reverse=True)
sorted_gt_dt_pairs = [item[0] for item in sorted_ious]
# matched gt and dt
for gt_dt_pair in sorted_gt_dt_pairs:
index_gt, index_dt = gt_dt_pair
if gt_match[index_gt] == False and dt_match[index_dt] == False:
gt_match[index_gt] = True
dt_match[index_dt] = True
# ocr rec results
gt_text = gt_info[index_gt]["text"]
dt_text = dt_info[index_dt]["text"]
# ser results
gt_label = gt_info[index_gt]["label"]
dt_label = dt_info[index_dt]["pred"]
if True: # ignore_masks[index_gt] == '0':
ed_sum += ed(args, gt_text, dt_text)
num_gt_chars += len(gt_text)
if gt_text == dt_text:
if args.ignore_ser_prediction or gt_label == dt_label:
hit += 1
# unmatched dt
for tindex, dt_match_flag in enumerate(dt_match):
if dt_match_flag == False:
dt_text = dt_info[tindex]["text"]
gt_text = ""
ed_sum += ed(args, dt_text, gt_text)
# unmatched gt
for tindex, gt_match_flag in enumerate(gt_match):
if gt_match_flag == False:
dt_text = ""
gt_text = gt_info[tindex]["text"]
ed_sum += ed(args, gt_text, dt_text)
num_gt_chars += len(gt_text)
eps = 1e-9
print("config: ", args)
print('hit, dt_count, gt_count', hit, dt_count, gt_count)
precision = hit / (dt_count + eps)
recall = hit / (gt_count + eps)
fmeasure = 2.0 * precision * recall / (precision + recall + eps)
avg_edit_dist_img = ed_sum / len(gt_results)
avg_edit_dist_field = ed_sum / (gt_count + eps)
character_acc = 1 - ed_sum / (num_gt_chars + eps)
print('character_acc: %.2f' % (character_acc * 100) + "%")
print('avg_edit_dist_field: %.2f' % (avg_edit_dist_field))
print('avg_edit_dist_img: %.2f' % (avg_edit_dist_img))
print('precision: %.2f' % (precision * 100) + "%")
print('recall: %.2f' % (recall * 100) + "%")
print('fmeasure: %.2f' % (fmeasure * 100) + "%")
return
def parse_args():
"""
"""
def str2bool(v):
return v.lower() in ("true", "t", "1")
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument(
"--gt_json_path",
default=None,
type=str,
required=True, )
parser.add_argument(
"--pred_json_path",
default=None,
type=str,
required=True, )
parser.add_argument("--iou_thres", default=0.5, type=float)
parser.add_argument(
"--ignore_case",
default=False,
type=str2bool,
help="whether to do lower case for the strs")
parser.add_argument(
"--ignore_space",
default=True,
type=str2bool,
help="whether to ignore space")
parser.add_argument(
"--ignore_background",
default=True,
type=str2bool,
help="whether to ignore other label")
parser.add_argument(
"--ignore_ser_prediction",
default=False,
type=str2bool,
help="whether to ignore ocr pred results")
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
eval_e2e(args)

View File

@ -0,0 +1,52 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
def transfer_xfun_data(json_path=None, output_file=None):
with open(json_path, "r") as fin:
lines = fin.readlines()
json_info = json.loads(lines[0])
documents = json_info["documents"]
label_info = {}
with open(output_file, "w") as fout:
for idx, document in enumerate(documents):
img_info = document["img"]
document = document["document"]
image_path = img_info["fname"]
label_info["height"] = img_info["height"]
label_info["width"] = img_info["width"]
label_info["ocr_info"] = []
for doc in document:
label_info["ocr_info"].append({
"text": doc["text"],
"label": doc["label"],
"bbox": doc["box"],
"id": doc["id"],
"linking": doc["linking"],
"words": doc["words"]
})
fout.write(image_path + "\t" + json.dumps(
label_info, ensure_ascii=False) + "\n")
print("===ok====")
transfer_xfun_data("./xfun/zh.val.json", "./xfun_normalize_val.json")

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 MiB

View File

@ -0,0 +1,279 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import json
import cv2
import numpy as np
from copy import deepcopy
import paddle
# relative reference
from utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
def pad_sentences(tokenizer,
encoded_inputs,
max_seq_len=512,
pad_to_max_seq_len=True,
return_attention_mask=True,
return_token_type_ids=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False):
# Padding with larger size, reshape is carried out
max_seq_len = (
len(encoded_inputs["input_ids"]) // max_seq_len + 1) * max_seq_len
needs_to_be_padded = pad_to_max_seq_len and \
max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len
if needs_to_be_padded:
difference = max_seq_len - len(encoded_inputs["input_ids"])
if tokenizer.padding_side == 'right':
if return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
"input_ids"]) + [0] * difference
if return_token_type_ids:
encoded_inputs["token_type_ids"] = (
encoded_inputs["token_type_ids"] +
[tokenizer.pad_token_type_id] * difference)
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = encoded_inputs[
"special_tokens_mask"] + [1] * difference
encoded_inputs["input_ids"] = encoded_inputs[
"input_ids"] + [tokenizer.pad_token_id] * difference
encoded_inputs["bbox"] = encoded_inputs["bbox"] + [[0, 0, 0, 0]
] * difference
else:
assert False, f"padding_side of tokenizer just supports [\"right\"] but got {tokenizer.padding_side}"
else:
if return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
"input_ids"])
return encoded_inputs
def split_page(encoded_inputs, max_seq_len=512):
"""
truncate is often used in training process
"""
for key in encoded_inputs:
encoded_inputs[key] = paddle.to_tensor(encoded_inputs[key])
if encoded_inputs[key].ndim <= 1: # for input_ids, att_mask and so on
encoded_inputs[key] = encoded_inputs[key].reshape([-1, max_seq_len])
else: # for bbox
encoded_inputs[key] = encoded_inputs[key].reshape(
[-1, max_seq_len, 4])
return encoded_inputs
def preprocess(
tokenizer,
ori_img,
ocr_info,
img_size=(224, 224),
pad_token_label_id=-100,
max_seq_len=512,
add_special_ids=False,
return_attention_mask=True, ):
ocr_info = deepcopy(ocr_info)
height = ori_img.shape[0]
width = ori_img.shape[1]
img = cv2.resize(ori_img,
(224, 224)).transpose([2, 0, 1]).astype(np.float32)
segment_offset_id = []
words_list = []
bbox_list = []
input_ids_list = []
token_type_ids_list = []
for info in ocr_info:
# x1, y1, x2, y2
bbox = info["bbox"]
bbox[0] = int(bbox[0] * 1000.0 / width)
bbox[2] = int(bbox[2] * 1000.0 / width)
bbox[1] = int(bbox[1] * 1000.0 / height)
bbox[3] = int(bbox[3] * 1000.0 / height)
text = info["text"]
encode_res = tokenizer.encode(
text, pad_to_max_seq_len=False, return_attention_mask=True)
if not add_special_ids:
# TODO: use tok.all_special_ids to remove
encode_res["input_ids"] = encode_res["input_ids"][1:-1]
encode_res["token_type_ids"] = encode_res["token_type_ids"][1:-1]
encode_res["attention_mask"] = encode_res["attention_mask"][1:-1]
input_ids_list.extend(encode_res["input_ids"])
token_type_ids_list.extend(encode_res["token_type_ids"])
bbox_list.extend([bbox] * len(encode_res["input_ids"]))
words_list.append(text)
segment_offset_id.append(len(input_ids_list))
encoded_inputs = {
"input_ids": input_ids_list,
"token_type_ids": token_type_ids_list,
"bbox": bbox_list,
"attention_mask": [1] * len(input_ids_list),
}
encoded_inputs = pad_sentences(
tokenizer,
encoded_inputs,
max_seq_len=max_seq_len,
return_attention_mask=return_attention_mask)
encoded_inputs = split_page(encoded_inputs)
fake_bs = encoded_inputs["input_ids"].shape[0]
encoded_inputs["image"] = paddle.to_tensor(img).unsqueeze(0).expand(
[fake_bs] + list(img.shape))
encoded_inputs["segment_offset_id"] = segment_offset_id
return encoded_inputs
def postprocess(attention_mask, preds, label_map_path):
if isinstance(preds, paddle.Tensor):
preds = preds.numpy()
preds = np.argmax(preds, axis=2)
_, label_map = get_bio_label_maps(label_map_path)
preds_list = [[] for _ in range(preds.shape[0])]
# keep batch info
for i in range(preds.shape[0]):
for j in range(preds.shape[1]):
if attention_mask[i][j] == 1:
preds_list[i].append(label_map[preds[i][j]])
return preds_list
def merge_preds_list_with_ocr_info(label_map_path, ocr_info, segment_offset_id,
preds_list):
# must ensure the preds_list is generated from the same image
preds = [p for pred in preds_list for p in pred]
label2id_map, _ = get_bio_label_maps(label_map_path)
for key in label2id_map:
if key.startswith("I-"):
label2id_map[key] = label2id_map["B" + key[1:]]
id2label_map = dict()
for key in label2id_map:
val = label2id_map[key]
if key == "O":
id2label_map[val] = key
if key.startswith("B-") or key.startswith("I-"):
id2label_map[val] = key[2:]
else:
id2label_map[val] = key
for idx in range(len(segment_offset_id)):
if idx == 0:
start_id = 0
else:
start_id = segment_offset_id[idx - 1]
end_id = segment_offset_id[idx]
curr_pred = preds[start_id:end_id]
curr_pred = [label2id_map[p] for p in curr_pred]
if len(curr_pred) <= 0:
pred_id = 0
else:
counts = np.bincount(curr_pred)
pred_id = np.argmax(counts)
ocr_info[idx]["pred_id"] = int(pred_id)
ocr_info[idx]["pred"] = id2label_map[pred_id]
return ocr_info
@paddle.no_grad()
def infer(args):
os.makedirs(args.output_dir, exist_ok=True)
# init token and model
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
# model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
model = LayoutXLMForTokenClassification.from_pretrained(
args.model_name_or_path)
model.eval()
# load ocr results json
ocr_results = dict()
with open(args.ocr_json_path, "r") as fin:
lines = fin.readlines()
for line in lines:
img_name, json_info = line.split("\t")
ocr_results[os.path.basename(img_name)] = json.loads(json_info)
# get infer img list
infer_imgs = get_image_file_list(args.infer_imgs)
# loop for infer
with open(os.path.join(args.output_dir, "infer_results.txt"), "w") as fout:
for idx, img_path in enumerate(infer_imgs):
print("process: [{}/{}]".format(idx, len(infer_imgs), img_path))
img = cv2.imread(img_path)
ocr_info = ocr_results[os.path.basename(img_path)]["ocr_info"]
inputs = preprocess(
tokenizer=tokenizer,
ori_img=img,
ocr_info=ocr_info,
max_seq_len=args.max_seq_length)
outputs = model(
input_ids=inputs["input_ids"],
bbox=inputs["bbox"],
image=inputs["image"],
token_type_ids=inputs["token_type_ids"],
attention_mask=inputs["attention_mask"])
preds = outputs[0]
preds = postprocess(inputs["attention_mask"], preds,
args.label_map_path)
ocr_info = merge_preds_list_with_ocr_info(
args.label_map_path, ocr_info, inputs["segment_offset_id"],
preds)
fout.write(img_path + "\t" + json.dumps(
{
"ocr_info": ocr_info,
}, ensure_ascii=False) + "\n")
img_res = draw_ser_results(img, ocr_info)
cv2.imwrite(
os.path.join(args.output_dir, os.path.basename(img_path)),
img_res)
return
if __name__ == "__main__":
args = parse_args()
infer(args)

View File

@ -0,0 +1,121 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import json
import cv2
import numpy as np
from copy import deepcopy
from PIL import Image
import paddle
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
# relative reference
from utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps, build_ocr_engine
from utils import pad_sentences, split_page, preprocess, postprocess, merge_preds_list_with_ocr_info
def trans_poly_to_bbox(poly):
x1 = np.min([p[0] for p in poly])
x2 = np.max([p[0] for p in poly])
y1 = np.min([p[1] for p in poly])
y2 = np.max([p[1] for p in poly])
return [x1, y1, x2, y2]
def parse_ocr_info_for_ser(ocr_result):
ocr_info = []
for res in ocr_result:
ocr_info.append({
"text": res[1][0],
"bbox": trans_poly_to_bbox(res[0]),
"poly": res[0],
})
return ocr_info
@paddle.no_grad()
def infer(args):
os.makedirs(args.output_dir, exist_ok=True)
# init token and model
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
model = LayoutXLMForTokenClassification.from_pretrained(
args.model_name_or_path)
model.eval()
label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
label2id_map_for_draw = dict()
for key in label2id_map:
if key.startswith("I-"):
label2id_map_for_draw[key] = label2id_map["B" + key[1:]]
else:
label2id_map_for_draw[key] = label2id_map[key]
# get infer img list
infer_imgs = get_image_file_list(args.infer_imgs)
ocr_engine = build_ocr_engine(args.ocr_rec_model_dir,
args.ocr_det_model_dir)
# loop for infer
with open(os.path.join(args.output_dir, "infer_results.txt"), "w") as fout:
for idx, img_path in enumerate(infer_imgs):
print("process: [{}/{}]".format(idx, len(infer_imgs), img_path))
img = cv2.imread(img_path)
ocr_result = ocr_engine.ocr(img_path, cls=False)
ocr_info = parse_ocr_info_for_ser(ocr_result)
inputs = preprocess(
tokenizer=tokenizer,
ori_img=img,
ocr_info=ocr_info,
max_seq_len=args.max_seq_length)
outputs = model(
input_ids=inputs["input_ids"],
bbox=inputs["bbox"],
image=inputs["image"],
token_type_ids=inputs["token_type_ids"],
attention_mask=inputs["attention_mask"])
preds = outputs[0]
preds = postprocess(inputs["attention_mask"], preds, id2label_map)
ocr_info = merge_preds_list_with_ocr_info(
ocr_info, inputs["segment_offset_id"], preds,
label2id_map_for_draw)
fout.write(img_path + "\t" + json.dumps(
{
"ocr_info": ocr_info,
}, ensure_ascii=False) + "\n")
img_res = draw_ser_results(img, ocr_info)
cv2.imwrite(
os.path.join(args.output_dir,
os.path.splitext(os.path.basename(img_path))[0] +
"_ser.jpg"), img_res)
return
if __name__ == "__main__":
args = parse_args()
infer(args)

View File

@ -0,0 +1,3 @@
QUESTION
ANSWER
HEADER

View File

@ -0,0 +1,2 @@
sentencepiece
yacs

View File

@ -0,0 +1,313 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import random
import copy
import logging
import argparse
import paddle
import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
from xfun import XFUNDataset
from utils import parse_args
from utils import get_bio_label_maps
logger = logging.getLogger(__name__)
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
paddle.seed(args.seed)
def train(args):
os.makedirs(args.output_dir, exist_ok=True)
logging.basicConfig(
filename=os.path.join(args.output_dir, "train.log")
if paddle.distributed.get_rank() == 0 else None,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO
if paddle.distributed.get_rank() == 0 else logging.WARN, )
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)
label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
# dist mode
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
base_model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
model = LayoutXLMForTokenClassification(
base_model, num_classes=len(label2id_map), dropout=None)
# dist mode
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
train_dataset = XFUNDataset(
tokenizer,
data_dir=args.train_data_dir,
label_path=args.train_label_path,
label2id_map=label2id_map,
img_size=(224, 224),
pad_token_label_id=pad_token_label_id,
contains_re=False,
add_special_ids=False,
return_attention_mask=True,
load_mode='all')
train_sampler = paddle.io.DistributedBatchSampler(
train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
args.train_batch_size = args.per_gpu_train_batch_size * max(
1, paddle.distributed.get_world_size())
train_dataloader = paddle.io.DataLoader(
train_dataset,
batch_sampler=train_sampler,
num_workers=0,
use_shared_memory=True,
collate_fn=None, )
t_total = len(train_dataloader) * args.num_train_epochs
# build linear decay with warmup lr sch
lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
learning_rate=args.learning_rate,
decay_steps=t_total,
end_lr=0.0,
power=1.0)
if args.warmup_steps > 0:
lr_scheduler = paddle.optimizer.lr.LinearWarmup(
lr_scheduler,
args.warmup_steps,
start_lr=0,
end_lr=args.learning_rate, )
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
epsilon=args.adam_epsilon,
weight_decay=args.weight_decay)
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d",
args.per_gpu_train_batch_size)
logger.info(
" Total train batch size (w. parallel, distributed) = %d",
args.train_batch_size * paddle.distributed.get_world_size(), )
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
tr_loss = 0.0
set_seed(args)
best_metrics = None
for epoch_id in range(args.num_train_epochs):
for step, batch in enumerate(train_dataloader):
model.train()
outputs = model(**batch)
# model outputs are always tuple in ppnlp (see doc)
loss = outputs[0]
loss = loss.mean()
logger.info(
"[epoch {}/{}][iter: {}/{}] lr: {:.5f}, train loss: {:.5f}, ".
format(epoch_id, args.num_train_epochs, step,
len(train_dataloader),
lr_scheduler.get_lr(), loss.numpy()[0]))
loss.backward()
tr_loss += loss.item()
optimizer.step()
lr_scheduler.step() # Update learning rate schedule
optimizer.clear_grad()
global_step += 1
if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
global_step % args.eval_steps == 0):
# Log metrics
# Only evaluate when single GPU otherwise metrics may not average well
if paddle.distributed.get_rank(
) == 0 and args.evaluate_during_training:
results, _ = evaluate(
args,
model,
tokenizer,
label2id_map,
id2label_map,
pad_token_label_id, )
if best_metrics is None or results["f1"] >= best_metrics[
"f1"]:
best_metrics = copy.deepcopy(results)
output_dir = os.path.join(args.output_dir, "best_model")
os.makedirs(output_dir, exist_ok=True)
if paddle.distributed.get_rank() == 0:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
paddle.save(
args,
os.path.join(output_dir, "training_args.bin"))
logger.info("Saving model checkpoint to %s",
output_dir)
logger.info("[epoch {}/{}][iter: {}/{}] results: {}".format(
epoch_id, args.num_train_epochs, step,
len(train_dataloader), results))
if best_metrics is not None:
logger.info("best metrics: {}".format(best_metrics))
if paddle.distributed.get_rank(
) == 0 and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir,
"checkpoint-{}".format(global_step))
os.makedirs(output_dir, exist_ok=True)
if paddle.distributed.get_rank() == 0:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
paddle.save(args,
os.path.join(output_dir, "training_args.bin"))
logger.info("Saving model checkpoint to %s", output_dir)
return global_step, tr_loss / global_step
def evaluate(args,
model,
tokenizer,
label2id_map,
id2label_map,
pad_token_label_id,
prefix=""):
eval_dataset = XFUNDataset(
tokenizer,
data_dir=args.eval_data_dir,
label_path=args.eval_label_path,
label2id_map=label2id_map,
img_size=(224, 224),
pad_token_label_id=pad_token_label_id,
contains_re=False,
add_special_ids=False,
return_attention_mask=True,
load_mode='all')
args.eval_batch_size = args.per_gpu_eval_batch_size * max(
1, paddle.distributed.get_world_size())
eval_dataloader = paddle.io.DataLoader(
eval_dataset,
batch_size=args.eval_batch_size,
num_workers=0,
use_shared_memory=True,
collate_fn=None, )
# Eval!
logger.info("***** Running evaluation %s *****", prefix)
logger.info(" Num examples = %d", len(eval_dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
model.eval()
for idx, batch in enumerate(eval_dataloader):
with paddle.no_grad():
outputs = model(**batch)
tmp_eval_loss, logits = outputs[:2]
tmp_eval_loss = tmp_eval_loss.mean()
if paddle.distributed.get_rank() == 0:
logger.info("[Eval]process: {}/{}, loss: {:.5f}".format(
idx, len(eval_dataloader), tmp_eval_loss.numpy()[0]))
eval_loss += tmp_eval_loss.item()
nb_eval_steps += 1
if preds is None:
preds = logits.numpy()
out_label_ids = batch["labels"].numpy()
else:
preds = np.append(preds, logits.numpy(), axis=0)
out_label_ids = np.append(
out_label_ids, batch["labels"].numpy(), axis=0)
eval_loss = eval_loss / nb_eval_steps
preds = np.argmax(preds, axis=2)
# label_map = {i: label.upper() for i, label in enumerate(labels)}
out_label_list = [[] for _ in range(out_label_ids.shape[0])]
preds_list = [[] for _ in range(out_label_ids.shape[0])]
for i in range(out_label_ids.shape[0]):
for j in range(out_label_ids.shape[1]):
if out_label_ids[i, j] != pad_token_label_id:
out_label_list[i].append(id2label_map[out_label_ids[i][j]])
preds_list[i].append(id2label_map[preds[i][j]])
results = {
"loss": eval_loss,
"precision": precision_score(out_label_list, preds_list),
"recall": recall_score(out_label_list, preds_list),
"f1": f1_score(out_label_list, preds_list),
}
with open(os.path.join(args.output_dir, "test_gt.txt"), "w") as fout:
for lbl in out_label_list:
for l in lbl:
fout.write(l + "\t")
fout.write("\n")
with open(os.path.join(args.output_dir, "test_pred.txt"), "w") as fout:
for lbl in preds_list:
for l in lbl:
fout.write(l + "\t")
fout.write("\n")
report = classification_report(out_label_list, preds_list)
logger.info("\n" + report)
logger.info("***** Eval results %s *****", prefix)
for key in sorted(results.keys()):
logger.info(" %s = %s", key, str(results[key]))
return results, preds_list
def print_arguments(args):
"""print arguments"""
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).items()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == "__main__":
args = parse_args()
print_arguments(args)
train(args)

View File

@ -0,0 +1,328 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import cv2
import random
import numpy as np
import imghdr
from copy import deepcopy
import paddle
from PIL import Image, ImageDraw, ImageFont
from paddleocr import PaddleOCR
def get_bio_label_maps(label_map_path):
with open(label_map_path, "r") as fin:
lines = fin.readlines()
lines = [line.strip() for line in lines]
if "O" not in lines:
lines.insert(0, "O")
labels = []
for line in lines:
if line == "O":
labels.append("O")
else:
labels.append("B-" + line)
labels.append("I-" + line)
label2id_map = {label: idx for idx, label in enumerate(labels)}
id2label_map = {idx: label for idx, label in enumerate(labels)}
return label2id_map, id2label_map
def get_image_file_list(img_file):
imgs_lists = []
if img_file is None or not os.path.exists(img_file):
raise Exception("not found any img file in {}".format(img_file))
img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'GIF'}
if os.path.isfile(img_file) and imghdr.what(img_file) in img_end:
imgs_lists.append(img_file)
elif os.path.isdir(img_file):
for single_file in os.listdir(img_file):
file_path = os.path.join(img_file, single_file)
if os.path.isfile(file_path) and imghdr.what(file_path) in img_end:
imgs_lists.append(file_path)
if len(imgs_lists) == 0:
raise Exception("not found any img file in {}".format(img_file))
imgs_lists = sorted(imgs_lists)
return imgs_lists
def draw_ser_results(image,
ocr_results,
font_path="../doc/fonts/simfang.ttf",
font_size=18):
np.random.seed(0)
color = (np.random.permutation(range(255)),
np.random.permutation(range(255)),
np.random.permutation(range(255)))
color_map = {
idx: (color[0][idx], color[1][idx], color[2][idx])
for idx in range(1, 255)
}
if isinstance(image, np.ndarray):
image = Image.fromarray(image)
img_new = image.copy()
draw = ImageDraw.Draw(img_new)
font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
for ocr_info in ocr_results:
if ocr_info["pred_id"] not in color_map:
continue
color = color_map[ocr_info["pred_id"]]
# draw ocr results outline
bbox = ocr_info["bbox"]
bbox = ((bbox[0], bbox[1]), (bbox[2], bbox[3]))
draw.rectangle(bbox, fill=color)
# draw ocr results
text = "{}: {}".format(ocr_info["pred"], ocr_info["text"])
start_y = max(0, bbox[0][1] - font_size)
tw = font.getsize(text)[0]
draw.rectangle(
[(bbox[0][0] + 1, start_y), (bbox[0][0] + tw + 1,
start_y + font_size)],
fill=(0, 0, 255))
draw.text(
(bbox[0][0] + 1, start_y), text, fill=(255, 255, 255), font=font)
img_new = Image.blend(image, img_new, 0.5)
return np.array(img_new)
def build_ocr_engine(rec_model_dir, det_model_dir):
ocr_engine = PaddleOCR(
rec_model_dir=rec_model_dir,
det_model_dir=det_model_dir,
use_angle_cls=False)
return ocr_engine
# pad sentences
def pad_sentences(tokenizer,
encoded_inputs,
max_seq_len=512,
pad_to_max_seq_len=True,
return_attention_mask=True,
return_token_type_ids=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False):
# Padding with larger size, reshape is carried out
max_seq_len = (
len(encoded_inputs["input_ids"]) // max_seq_len + 1) * max_seq_len
needs_to_be_padded = pad_to_max_seq_len and \
max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len
if needs_to_be_padded:
difference = max_seq_len - len(encoded_inputs["input_ids"])
if tokenizer.padding_side == 'right':
if return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
"input_ids"]) + [0] * difference
if return_token_type_ids:
encoded_inputs["token_type_ids"] = (
encoded_inputs["token_type_ids"] +
[tokenizer.pad_token_type_id] * difference)
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = encoded_inputs[
"special_tokens_mask"] + [1] * difference
encoded_inputs["input_ids"] = encoded_inputs[
"input_ids"] + [tokenizer.pad_token_id] * difference
encoded_inputs["bbox"] = encoded_inputs["bbox"] + [[0, 0, 0, 0]
] * difference
else:
if return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
"input_ids"])
return encoded_inputs
def split_page(encoded_inputs, max_seq_len=512):
"""
truncate is often used in training process
"""
for key in encoded_inputs:
encoded_inputs[key] = paddle.to_tensor(encoded_inputs[key])
if encoded_inputs[key].ndim <= 1: # for input_ids, att_mask and so on
encoded_inputs[key] = encoded_inputs[key].reshape([-1, max_seq_len])
else: # for bbox
encoded_inputs[key] = encoded_inputs[key].reshape(
[-1, max_seq_len, 4])
return encoded_inputs
def preprocess(
tokenizer,
ori_img,
ocr_info,
img_size=(224, 224),
pad_token_label_id=-100,
max_seq_len=512,
add_special_ids=False,
return_attention_mask=True, ):
ocr_info = deepcopy(ocr_info)
height = ori_img.shape[0]
width = ori_img.shape[1]
img = cv2.resize(ori_img,
(224, 224)).transpose([2, 0, 1]).astype(np.float32)
segment_offset_id = []
words_list = []
bbox_list = []
input_ids_list = []
token_type_ids_list = []
for info in ocr_info:
# x1, y1, x2, y2
bbox = info["bbox"]
bbox[0] = int(bbox[0] * 1000.0 / width)
bbox[2] = int(bbox[2] * 1000.0 / width)
bbox[1] = int(bbox[1] * 1000.0 / height)
bbox[3] = int(bbox[3] * 1000.0 / height)
text = info["text"]
encode_res = tokenizer.encode(
text, pad_to_max_seq_len=False, return_attention_mask=True)
if not add_special_ids:
# TODO: use tok.all_special_ids to remove
encode_res["input_ids"] = encode_res["input_ids"][1:-1]
encode_res["token_type_ids"] = encode_res["token_type_ids"][1:-1]
encode_res["attention_mask"] = encode_res["attention_mask"][1:-1]
input_ids_list.extend(encode_res["input_ids"])
token_type_ids_list.extend(encode_res["token_type_ids"])
bbox_list.extend([bbox] * len(encode_res["input_ids"]))
words_list.append(text)
segment_offset_id.append(len(input_ids_list))
encoded_inputs = {
"input_ids": input_ids_list,
"token_type_ids": token_type_ids_list,
"bbox": bbox_list,
"attention_mask": [1] * len(input_ids_list),
}
encoded_inputs = pad_sentences(
tokenizer,
encoded_inputs,
max_seq_len=max_seq_len,
return_attention_mask=return_attention_mask)
encoded_inputs = split_page(encoded_inputs)
fake_bs = encoded_inputs["input_ids"].shape[0]
encoded_inputs["image"] = paddle.to_tensor(img).unsqueeze(0).expand(
[fake_bs] + list(img.shape))
encoded_inputs["segment_offset_id"] = segment_offset_id
return encoded_inputs
def postprocess(attention_mask, preds, id2label_map):
if isinstance(preds, paddle.Tensor):
preds = preds.numpy()
preds = np.argmax(preds, axis=2)
preds_list = [[] for _ in range(preds.shape[0])]
# keep batch info
for i in range(preds.shape[0]):
for j in range(preds.shape[1]):
if attention_mask[i][j] == 1:
preds_list[i].append(id2label_map[preds[i][j]])
return preds_list
def merge_preds_list_with_ocr_info(ocr_info, segment_offset_id, preds_list,
label2id_map_for_draw):
# must ensure the preds_list is generated from the same image
preds = [p for pred in preds_list for p in pred]
id2label_map = dict()
for key in label2id_map_for_draw:
val = label2id_map_for_draw[key]
if key == "O":
id2label_map[val] = key
if key.startswith("B-") or key.startswith("I-"):
id2label_map[val] = key[2:]
else:
id2label_map[val] = key
for idx in range(len(segment_offset_id)):
if idx == 0:
start_id = 0
else:
start_id = segment_offset_id[idx - 1]
end_id = segment_offset_id[idx]
curr_pred = preds[start_id:end_id]
curr_pred = [label2id_map_for_draw[p] for p in curr_pred]
if len(curr_pred) <= 0:
pred_id = 0
else:
counts = np.bincount(curr_pred)
pred_id = np.argmax(counts)
ocr_info[idx]["pred_id"] = int(pred_id)
ocr_info[idx]["pred"] = id2label_map[int(pred_id)]
return ocr_info
def parse_args():
parser = argparse.ArgumentParser()
# Required parameters
# yapf: disable
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,)
parser.add_argument("--train_data_dir", default=None, type=str, required=False,)
parser.add_argument("--train_label_path", default=None, type=str, required=False,)
parser.add_argument("--eval_data_dir", default=None, type=str, required=False,)
parser.add_argument("--eval_label_path", default=None, type=str, required=False,)
parser.add_argument("--output_dir", default=None, type=str, required=True,)
parser.add_argument("--max_seq_length", default=512, type=int,)
parser.add_argument("--evaluate_during_training", action="store_true",)
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",)
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for eval.",)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.",)
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.",)
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.",)
parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.",)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.",)
parser.add_argument("--eval_steps", type=int, default=10, help="eval every X updates steps.",)
parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.",)
parser.add_argument("--seed", type=int, default=2048, help="random seed for initialization",)
parser.add_argument("--ocr_rec_model_dir", default=None, type=str, )
parser.add_argument("--ocr_det_model_dir", default=None, type=str, )
parser.add_argument("--label_map_path", default="./labels/labels_ser.txt", type=str, required=False, )
parser.add_argument("--infer_imgs", default=None, type=str, required=False)
parser.add_argument("--ocr_json_path", default=None, type=str, required=False, help="ocr prediction results")
# yapf: enable
args = parser.parse_args()
return args

View File

@ -0,0 +1,442 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import cv2
import numpy as np
import paddle
import copy
from paddle.io import Dataset
__all__ = ["XFUNDataset"]
class XFUNDataset(Dataset):
"""
Example:
print("=====begin to build dataset=====")
from paddlenlp.transformers import LayoutXLMTokenizer
tokenizer = LayoutXLMTokenizer.from_pretrained("/paddle/models/transformers/layoutxlm-base-paddle/")
tok_res = tokenizer.tokenize("Maribyrnong")
# res = tokenizer.convert_ids_to_tokens(val_data["input_ids"][0])
dataset = XfunDatasetForSer(
tokenizer,
data_dir="./zh.val/",
label_path="zh.val/xfun_normalize_val.json",
img_size=(224,224))
print(len(dataset))
data = dataset[0]
print(data.keys())
print("input_ids: ", data["input_ids"])
print("labels: ", data["labels"])
print("token_type_ids: ", data["token_type_ids"])
print("words_list: ", data["words_list"])
print("image shape: ", data["image"].shape)
"""
def __init__(self,
tokenizer,
data_dir,
label_path,
contains_re=False,
label2id_map=None,
img_size=(224, 224),
pad_token_label_id=None,
add_special_ids=False,
return_attention_mask=True,
load_mode='all',
max_seq_len=512):
super().__init__()
self.tokenizer = tokenizer
self.data_dir = data_dir
self.label_path = label_path
self.contains_re = contains_re
self.label2id_map = label2id_map
self.img_size = img_size
self.pad_token_label_id = pad_token_label_id
self.add_special_ids = add_special_ids
self.return_attention_mask = return_attention_mask
self.load_mode = load_mode
self.max_seq_len = max_seq_len
if self.pad_token_label_id is None:
self.pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
self.all_lines = self.read_all_lines()
self.entities_labels = {'HEADER': 0, 'QUESTION': 1, 'ANSWER': 2}
self.return_keys = {
'bbox': 'np',
'input_ids': 'np',
'labels': 'np',
'attention_mask': 'np',
'image': 'np',
'token_type_ids': 'np',
'entities': 'dict',
'relations': 'dict',
}
if load_mode == "all":
self.encoded_inputs_all = self._parse_label_file_all()
def pad_sentences(self,
encoded_inputs,
max_seq_len=512,
pad_to_max_seq_len=True,
return_attention_mask=True,
return_token_type_ids=True,
truncation_strategy="longest_first",
return_overflowing_tokens=False,
return_special_tokens_mask=False):
# Padding
needs_to_be_padded = pad_to_max_seq_len and \
max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len
if needs_to_be_padded:
difference = max_seq_len - len(encoded_inputs["input_ids"])
if self.tokenizer.padding_side == 'right':
if return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
"input_ids"]) + [0] * difference
if return_token_type_ids:
encoded_inputs["token_type_ids"] = (
encoded_inputs["token_type_ids"] +
[self.tokenizer.pad_token_type_id] * difference)
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = encoded_inputs[
"special_tokens_mask"] + [1] * difference
encoded_inputs["input_ids"] = encoded_inputs[
"input_ids"] + [self.tokenizer.pad_token_id] * difference
encoded_inputs["labels"] = encoded_inputs[
"labels"] + [self.pad_token_label_id] * difference
encoded_inputs["bbox"] = encoded_inputs[
"bbox"] + [[0, 0, 0, 0]] * difference
elif self.tokenizer.padding_side == 'left':
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + [
1
] * len(encoded_inputs["input_ids"])
if return_token_type_ids:
encoded_inputs["token_type_ids"] = (
[self.tokenizer.pad_token_type_id] * difference +
encoded_inputs["token_type_ids"])
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = [
1
] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs["input_ids"] = [
self.tokenizer.pad_token_id
] * difference + encoded_inputs["input_ids"]
encoded_inputs["labels"] = [
self.pad_token_label_id
] * difference + encoded_inputs["labels"]
encoded_inputs["bbox"] = [
[0, 0, 0, 0]
] * difference + encoded_inputs["bbox"]
else:
if return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
"input_ids"])
return encoded_inputs
def truncate_inputs(self, encoded_inputs, max_seq_len=512):
for key in encoded_inputs:
if key == "sample_id":
continue
length = min(len(encoded_inputs[key]), max_seq_len)
encoded_inputs[key] = encoded_inputs[key][:length]
return encoded_inputs
def read_all_lines(self, ):
with open(self.label_path, "r") as fin:
lines = fin.readlines()
return lines
def _parse_label_file_all(self):
"""
parse all samples
"""
encoded_inputs_all = []
for line in self.all_lines:
encoded_inputs_all.extend(self._parse_label_file(line))
return encoded_inputs_all
def _parse_label_file(self, line):
"""
parse single sample
"""
image_name, info_str = line.split("\t")
image_path = os.path.join(self.data_dir, image_name)
def add_imgge_path(x):
x['image_path'] = image_path
return x
encoded_inputs = self._read_encoded_inputs_sample(info_str)
if self.contains_re:
encoded_inputs = self._chunk_re(encoded_inputs)
else:
encoded_inputs = self._chunk_ser(encoded_inputs)
encoded_inputs = list(map(add_imgge_path, encoded_inputs))
return encoded_inputs
def _read_encoded_inputs_sample(self, info_str):
"""
parse label info
"""
# read text info
info_dict = json.loads(info_str)
height = info_dict["height"]
width = info_dict["width"]
words_list = []
bbox_list = []
input_ids_list = []
token_type_ids_list = []
gt_label_list = []
if self.contains_re:
# for re
entities = []
relations = []
id2label = {}
entity_id_to_index_map = {}
empty_entity = set()
for info in info_dict["ocr_info"]:
if self.contains_re:
# for re
if len(info["text"]) == 0:
empty_entity.add(info["id"])
continue
id2label[info["id"]] = info["label"]
relations.extend([tuple(sorted(l)) for l in info["linking"]])
# x1, y1, x2, y2
bbox = info["bbox"]
label = info["label"]
bbox[0] = int(bbox[0] * 1000.0 / width)
bbox[2] = int(bbox[2] * 1000.0 / width)
bbox[1] = int(bbox[1] * 1000.0 / height)
bbox[3] = int(bbox[3] * 1000.0 / height)
text = info["text"]
encode_res = self.tokenizer.encode(
text, pad_to_max_seq_len=False, return_attention_mask=True)
gt_label = []
if not self.add_special_ids:
# TODO: use tok.all_special_ids to remove
encode_res["input_ids"] = encode_res["input_ids"][1:-1]
encode_res["token_type_ids"] = encode_res["token_type_ids"][1:
-1]
encode_res["attention_mask"] = encode_res["attention_mask"][1:
-1]
if label.lower() == "other":
gt_label.extend([0] * len(encode_res["input_ids"]))
else:
gt_label.append(self.label2id_map[("b-" + label).upper()])
gt_label.extend([self.label2id_map[("i-" + label).upper()]] *
(len(encode_res["input_ids"]) - 1))
if self.contains_re:
if gt_label[0] != self.label2id_map["O"]:
entity_id_to_index_map[info["id"]] = len(entities)
entities.append({
"start": len(input_ids_list),
"end":
len(input_ids_list) + len(encode_res["input_ids"]),
"label": label.upper(),
})
input_ids_list.extend(encode_res["input_ids"])
token_type_ids_list.extend(encode_res["token_type_ids"])
bbox_list.extend([bbox] * len(encode_res["input_ids"]))
gt_label_list.extend(gt_label)
words_list.append(text)
encoded_inputs = {
"input_ids": input_ids_list,
"labels": gt_label_list,
"token_type_ids": token_type_ids_list,
"bbox": bbox_list,
"attention_mask": [1] * len(input_ids_list),
# "words_list": words_list,
}
encoded_inputs = self.pad_sentences(
encoded_inputs,
max_seq_len=self.max_seq_len,
return_attention_mask=self.return_attention_mask)
encoded_inputs = self.truncate_inputs(encoded_inputs)
if self.contains_re:
relations = self._relations(entities, relations, id2label,
empty_entity, entity_id_to_index_map)
encoded_inputs['relations'] = relations
encoded_inputs['entities'] = entities
return encoded_inputs
def _chunk_ser(self, encoded_inputs):
encoded_inputs_all = []
seq_len = len(encoded_inputs['input_ids'])
chunk_size = 512
for chunk_id, index in enumerate(range(0, seq_len, chunk_size)):
chunk_beg = index
chunk_end = min(index + chunk_size, seq_len)
encoded_inputs_example = {}
for key in encoded_inputs:
encoded_inputs_example[key] = encoded_inputs[key][chunk_beg:
chunk_end]
encoded_inputs_all.append(encoded_inputs_example)
return encoded_inputs_all
def _chunk_re(self, encoded_inputs):
# prepare data
entities = encoded_inputs.pop('entities')
relations = encoded_inputs.pop('relations')
encoded_inputs_all = []
chunk_size = 512
for chunk_id, index in enumerate(
range(0, len(encoded_inputs["input_ids"]), chunk_size)):
item = {}
for k in encoded_inputs:
item[k] = encoded_inputs[k][index:index + chunk_size]
# select entity in current chunk
entities_in_this_span = []
global_to_local_map = {} #
for entity_id, entity in enumerate(entities):
if (index <= entity["start"] < index + chunk_size and
index <= entity["end"] < index + chunk_size):
entity["start"] = entity["start"] - index
entity["end"] = entity["end"] - index
global_to_local_map[entity_id] = len(entities_in_this_span)
entities_in_this_span.append(entity)
# select relations in current chunk
relations_in_this_span = []
for relation in relations:
if (index <= relation["start_index"] < index + chunk_size and
index <= relation["end_index"] < index + chunk_size):
relations_in_this_span.append({
"head": global_to_local_map[relation["head"]],
"tail": global_to_local_map[relation["tail"]],
"start_index": relation["start_index"] - index,
"end_index": relation["end_index"] - index,
})
item.update({
"entities": reformat(entities_in_this_span),
"relations": reformat(relations_in_this_span),
})
item['entities']['label'] = [
self.entities_labels[x] for x in item['entities']['label']
]
encoded_inputs_all.append(item)
return encoded_inputs_all
def _relations(self, entities, relations, id2label, empty_entity,
entity_id_to_index_map):
"""
build relations
"""
relations = list(set(relations))
relations = [
rel for rel in relations
if rel[0] not in empty_entity and rel[1] not in empty_entity
]
kv_relations = []
for rel in relations:
pair = [id2label[rel[0]], id2label[rel[1]]]
if pair == ["question", "answer"]:
kv_relations.append({
"head": entity_id_to_index_map[rel[0]],
"tail": entity_id_to_index_map[rel[1]]
})
elif pair == ["answer", "question"]:
kv_relations.append({
"head": entity_id_to_index_map[rel[1]],
"tail": entity_id_to_index_map[rel[0]]
})
else:
continue
relations = sorted(
[{
"head": rel["head"],
"tail": rel["tail"],
"start_index": get_relation_span(rel, entities)[0],
"end_index": get_relation_span(rel, entities)[1],
} for rel in kv_relations],
key=lambda x: x["head"], )
return relations
def load_img(self, image_path):
# read img
img = cv2.imread(image_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
resize_h, resize_w = self.img_size
im_shape = img.shape[0:2]
im_scale_y = resize_h / im_shape[0]
im_scale_x = resize_w / im_shape[1]
img_new = cv2.resize(
img, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=2)
mean = np.array([0.485, 0.456, 0.406])[np.newaxis, np.newaxis, :]
std = np.array([0.229, 0.224, 0.225])[np.newaxis, np.newaxis, :]
img_new = img_new / 255.0
img_new -= mean
img_new /= std
img = img_new.transpose((2, 0, 1))
return img
def __getitem__(self, idx):
if self.load_mode == "all":
data = copy.deepcopy(self.encoded_inputs_all[idx])
else:
data = self._parse_label_file(self.all_lines[idx])[0]
image_path = data.pop('image_path')
data["image"] = self.load_img(image_path)
return_data = {}
for k, v in data.items():
if k in self.return_keys:
if self.return_keys[k] == 'np':
v = np.array(v)
return_data[k] = v
return return_data
def __len__(self, ):
if self.load_mode == "all":
return len(self.encoded_inputs_all)
else:
return len(self.all_lines)
def get_relation_span(rel, entities):
bound = []
for entity_index in [rel["head"], rel["tail"]]:
bound.append(entities[entity_index]["start"])
bound.append(entities[entity_index]["end"])
return min(bound), max(bound)
def reformat(data):
new_data = {}
for item in data:
for k, v in item.items():
if k not in new_data:
new_data[k] = []
new_data[k].append(v)
return new_data

View File

@ -30,6 +30,7 @@ function func_set_params(){
function func_parser_params(){
strs=$1
MODE=$2
IFS=":"
array=(${strs})
key=${array[0]}

View File

@ -0,0 +1,19 @@
===========================ch_PP-OCRv2===========================
model_name:ch_PP-OCRv2
python:python3.7
infer_model:./inference/ch_PP-OCRv2_det_infer/
infer_export:null
infer_quant:False
inference:tools/infer/predict_system.py
--use_gpu:False|True
--enable_mkldnn:False|True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
--rec_model_dir:./inference/ch_PP-OCRv2_rec_infer/
--benchmark:True
null:null
null:null

View File

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn system
runtime_device:ARM_CPU
det_infer_model:ch_PP-OCRv2_det_infer|ch_PP-OCRv2_det_slim_quant_infer
rec_infer_model:ch_PP-OCRv2_rec_infer|ch_PP-OCRv2_rec_slim_quant_infer
cls_infer_model:ch_ppocr_mobile_v2.0_cls_infer|ch_ppocr_mobile_v2.0_cls_slim_infer
--cpu_threads:1|4
--det_batch_size:1
--rec_batch_size:1
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
--rec_dict_dir:./ppocr_keys_v1.txt
--benchmark:True

View File

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn system
runtime_device:ARM_GPU_OPENCL
det_infer_model:ch_PP-OCRv2_det_infer|ch_PP-OCRv2_det_slim_quant_infer
rec_infer_model:ch_PP-OCRv2_rec_infer|ch_PP-OCRv2_rec_slim_quant_infer
cls_infer_model:ch_ppocr_mobile_v2.0_cls_infer|ch_ppocr_mobile_v2.0_cls_slim_infer
--cpu_threads:1|4
--det_batch_size:1
--rec_batch_size:1
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
--rec_dict_dir:./ppocr_keys_v1.txt
--benchmark:True

View File

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn det
runtime_device:ARM_CPU
det_infer_model:ch_PP-OCRv2_det_infer|ch_PP-OCRv2_det_slim_quant_infer
null:null
null:null
--cpu_threads:1|4
--det_batch_size:1
null:null
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
null:null
--benchmark:True

View File

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn det
runtime_device:ARM_GPU_OPENCL
det_infer_model:ch_PP-OCRv2_det_infer|ch_PP-OCRv2_det_slim_quant_infer
null:null
null:null
--cpu_threads:1|4
--det_batch_size:1
null:null
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
null:null
--benchmark:True

View File

@ -1,20 +1,20 @@
===========================train_params===========================
model_name:PPOCRv2_ocr_det
model_name:ch_PPOCRv2_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_infer=1|whole_train_infer=500
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=500
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train|pact_train
norm_train:tools/train.py -c configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml -o
pact_train:deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml -o
norm_train:tools/train.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
pact_train:deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
fpgm_train:null
distill_train:null
null:null
@ -27,8 +27,8 @@ null:null
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml -o
quant_export:deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml -o
norm_export:tools/export_model.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
quant_export:deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
fpgm_export:
distill_export:null
export1:null

View File

@ -0,0 +1,21 @@
===========================kl_quant_params===========================
model_name:PPOCRv2_ocr_det_kl
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_PP-OCRv2_det_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
infer_quant:True
inference:tools/infer/predict_det.py
--use_gpu:False|True
--enable_mkldnn:True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null
null:null

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:PPOCRv2_ocr_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=500
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:pact_train
norm_train:null
pact_train:deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:null
quant_export:deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
fpgm_export:
distill_export:null
export1:null
export2:null
inference_dir:Student
infer_model:./inference/ch_PP-OCRv2_det_infer/
infer_export:null
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null

View File

@ -0,0 +1,159 @@
Global:
debug: false
use_gpu: true
epoch_num: 800
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec_pp-OCRv2_distillation
save_epoch_step: 3
eval_batch_step: [0, 2000]
cal_metric_during_train: true
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: false
infer_img: doc/imgs_words/ch/word_1.jpg
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 25
infer_mode: false
use_space_char: true
distributed: true
save_res_path: ./output/rec/predicts_pp-OCRv2_distillation.txt
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
lr:
name: Piecewise
decay_epochs : [700, 800]
values : [0.001, 0.0001]
warmup_epoch: 5
regularizer:
name: L2
factor: 2.0e-05
Architecture:
model_type: &model_type "rec"
name: DistillationModel
algorithm: Distillation
Models:
Teacher:
pretrained:
freeze_params: false
return_all_feats: true
model_type: *model_type
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 64
Head:
name: CTCHead
mid_channels: 96
fc_decay: 0.00002
Student:
pretrained:
freeze_params: false
return_all_feats: true
model_type: *model_type
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 64
Head:
name: CTCHead
mid_channels: 96
fc_decay: 0.00002
Loss:
name: CombinedLoss
loss_config_list:
- DistillationCTCLoss:
weight: 1.0
model_name_list: ["Student", "Teacher"]
key: head_out
- DistillationDMLLoss:
weight: 1.0
act: "softmax"
use_log: true
model_name_pairs:
- ["Student", "Teacher"]
key: head_out
- DistillationDistanceLoss:
weight: 1.0
mode: "l2"
model_name_pairs:
- ["Student", "Teacher"]
key: backbone_out
PostProcess:
name: DistillationCTCLabelDecode
model_name: ["Student", "Teacher"]
key: head_out
Metric:
name: DistillationMetric
base_metric_name: RecMetric
main_indicator: acc
key: "Student"
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/ic15_data/
label_file_list:
- ./train_data/ic15_data/rec_gt_train.txt
transforms:
- DecodeImage:
img_mode: BGR
channel_first: false
- RecAug:
- CTCLabelEncode:
- RecResizeImg:
image_shape: [3, 32, 320]
- KeepKeys:
keep_keys:
- image
- label
- length
loader:
shuffle: true
batch_size_per_card: 128
drop_last: true
num_sections: 1
num_workers: 8
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data/ic15_data
label_file_list:
- ./train_data/ic15_data/rec_gt_test.txt
transforms:
- DecodeImage:
img_mode: BGR
channel_first: false
- CTCLabelEncode:
- RecResizeImg:
image_shape: [3, 32, 320]
- KeepKeys:
keep_keys:
- image
- label
- length
loader:
shuffle: false
drop_last: false
batch_size_per_card: 128
num_workers: 8

View File

@ -0,0 +1,53 @@
===========================train_params===========================
model_name:PPOCRv2_ocr_rec
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=128|whole_train_whole_infer=128
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./inference/rec_inference
null:null
##
trainer:norm_train
norm_train:tools/train.py -c test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml -o
quant_export:
fpgm_export:
distill_export:null
export1:null
export2:null
inference_dir:Student
infer_model:./inference/ch_PP-OCRv2_rec_infer/
infer_export:null
infer_quant:False
inference:tools/infer/predict_rec.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--rec_model_dir:
--image_dir:/inference/rec_inference
null:null
--benchmark:True
null:null

View File

@ -0,0 +1,21 @@
===========================kl_quant_params===========================
model_name:PPOCRv2_ocr_rec_kl
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_PP-OCRv2_rec_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml -o
infer_quant:True
inference:tools/infer/predict_rec.py
--use_gpu:False|True
--enable_mkldnn:False|True
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:True
--precision:int8
--rec_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True
null:null
null:null

View File

@ -0,0 +1,53 @@
===========================train_params===========================
model_name:PPOCRv2_ocr_rec_pact
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=128|whole_train_whole_infer=128
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./inference/rec_inference
null:null
##
trainer:pact_train
norm_train:deploy/slim/quantization/quant.py -c test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml -o
quant_export:
fpgm_export:
distill_export:null
export1:null
export2:null
inference_dir:Student
infer_model:./inference/ch_PP-OCRv2_rec_infer/
infer_export:null
infer_quant:True
inference:tools/infer/predict_rec.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--rec_model_dir:
--image_dir:/inference/rec_inference
null:null
--benchmark:True
null:null

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ocr_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:fpgm_train
norm_train:null
pact_train:null
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:null
quant_export:null
fpgm_export:deploy/slim/prune/export_prune_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:null
infer_export:null
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null

View File

@ -0,0 +1,19 @@
===========================ch_ppocr_mobile_v2.0===========================
model_name:ch_ppocr_mobile_v2.0
python:python3.7
infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/
infer_export:null
infer_quant:False
inference:tools/infer/predict_system.py
--use_gpu:False|True
--enable_mkldnn:False|True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
--rec_model_dir:./inference/ch_ppocr_mobile_v2.0_rec_infer/
--benchmark:True
null:null
null:null

View File

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn system
runtime_device:ARM_CPU
det_infer_model:ch_ppocr_mobile_v2.0_det_infer|ch_ppocr_db_mobile_v2.0_det_quant_infer
rec_infer_model:ch_ppocr_mobile_v2.0_rec_infer|ch_ppocr_mobile_v2.0_rec_slim_infer
cls_infer_model:ch_ppocr_mobile_v2.0_cls_infer|ch_ppocr_mobile_v2.0_cls_slim_infer
--cpu_threads:1|4
--det_batch_size:1
--rec_batch_size:1
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
--rec_dict_dir:./ppocr_keys_v1.txt
--benchmark:True

View File

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn system
runtime_device:ARM_GPU_OPENCL
det_infer_model:ch_ppocr_mobile_v2.0_det_infer|ch_ppocr_db_mobile_v2.0_det_quant_infer
rec_infer_model:ch_ppocr_mobile_v2.0_rec_infer|ch_ppocr_mobile_v2.0_rec_slim_infer
cls_infer_model:ch_ppocr_mobile_v2.0_cls_infer|ch_ppocr_mobile_v2.0_cls_slim_infer
--cpu_threads:1|4
--det_batch_size:1
--rec_batch_size:1
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
--rec_dict_dir:./ppocr_keys_v1.txt
--benchmark:True

View File

@ -1,12 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn det
infer_model:ch_PP-OCRv2_det_infer|ch_PP-OCRv2_det_slim_quant_infer
runtime_device:ARM_CPU
det_infer_model:ch_ppocr_mobile_v2.0_det_infer|ch_ppocr_db_mobile_v2.0_det_quant_infer
null:null
null:null
--cpu_threads:1|4
--det_batch_size:1
--rec_batch_size:1
--system_batch_size:1
null:null
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
--rec_dict_dir:./ppocr_keys_v1.txt
null:null
--benchmark:True

View File

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn det
runtime_device:ARM_GPU_OPENCL
det_infer_model:ch_ppocr_mobile_v2.0_det_infer|ch_ppocr_db_mobile_v2.0_det_quant_infer
null:null
null:null
--cpu_threads:1|4
--det_batch_size:1
null:null
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
null:null
--benchmark:True

View File

@ -15,4 +15,4 @@ op.det.local_service_conf.thread_num:1|6
op.det.local_service_conf.use_trt:False|True
op.det.local_service_conf.precision:fp32|fp16|int8
pipline:pipeline_rpc_client.py|pipeline_http_client.py
--image_dir:../../doc/imgs
--image_dir:../../doc/imgs

View File

@ -1,10 +1,10 @@
===========================train_params===========================
model_name:ocr_det
model_name:ch_ppocr_mobile_v2.0_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=300
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
@ -12,10 +12,10 @@ train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train|pact_train|fpgm_train
norm_train:tools/train.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
trainer:norm_train
norm_train:tools/train.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
@ -27,9 +27,9 @@ null:null
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
norm_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null

View File

@ -13,9 +13,9 @@ train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train|pact_train|fpgm_train
norm_train:tools/train.py -c test_tipc/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/det_mv3_db.yml -o
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
norm_train:tools/train.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
distill_train:null
null:null
null:null
@ -27,9 +27,9 @@ null:null
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_db.yml -o
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/det_mv3_db.yml -o
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/det_mv3_db.yml -o
norm_export:tools/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
distill_export:null
export1:null
export2:null

View File

@ -13,9 +13,9 @@ train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train|pact_train|fpgm_train
norm_train:tools/train.py -c test_tipc/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/det_mv3_db.yml -o
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
norm_train:tools/train.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
distill_train:null
null:null
null:null
@ -27,9 +27,9 @@ null:null
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_db.yml -o
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/det_mv3_db.yml -o
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/det_mv3_db.yml -o
norm_export:tools/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
distill_export:null
export1:null
export2:null

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ocr_det
python:python
gpu_list:-1
Global.use_gpu:False
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train
norm_train:tools/train.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:./inference/ch_ppocr_mobile_v2.0_det_train/best_accuracy
infer_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:False
--enable_mkldnn:False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null

View File

@ -0,0 +1,52 @@
===========================train_params===========================
model_name:ocr_det
python:python
gpu_list:0
Global.use_gpu:True
Global.auto_cast:fp32|amp
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train
norm_train:tools/train.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:./inference/ch_ppocr_mobile_v2.0_det_train/best_accuracy
infer_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null

View File

@ -0,0 +1,21 @@
===========================kl_quant_params===========================
model_name:ch_ppocr_mobile_v2.0_det_KL
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
infer_quant:True
inference:tools/infer/predict_det.py
--use_gpu:False|True
--enable_mkldnn:True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null
null:null

View File

@ -0,0 +1,17 @@
===========================kl_quant_params===========================
infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
infer_quant:True
inference:tools/infer/predict_det.py
--use_gpu:False
--enable_mkldnn:False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null
null:null

View File

@ -0,0 +1,17 @@
===========================kl_quant_params===========================
infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
infer_quant:True
inference:tools/infer/predict_det.py
--use_gpu:False
--enable_mkldnn:False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null
null:null

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ocr_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:pact_train
norm_train:null
pact_train:deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:null
quant_export:deploy/slim/quantization/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
fpgm_export:null
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:null
infer_export:null
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ch_ppocr_mobile_v2.0_rec
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_infer=2|whole_train_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_infer=128|whole_train_infer=128
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./inference/rec_inference
null:null
##
trainer:norm_train
norm_train:tools/train.py -c configs/rec/rec_icdar15_train.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:tools/eval.py -c configs/rec/rec_icdar15_train.yml -o
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c configs/rec/rec_icdar15_train.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
##
infer_model:null
infer_export:tools/export_model.py -c configs/rec/rec_icdar15_train.yml -o
infer_quant:False
inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --rec_image_shape="3,32,100" --rec_algorithm="RARE"
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:True|False
--precision:fp32|fp16|int8
--rec_model_dir:
--image_dir:./inference/rec_inference
--save_log_path:./test/output/
--benchmark:True
null:null

View File

@ -0,0 +1,102 @@
Global:
use_gpu: true
epoch_num: 500
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec_chinese_lite_v2.0
save_epoch_step: 3
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 25
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_chinese_lite_v2.0.txt
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
lr:
name: Cosine
learning_rate: 0.001
regularizer:
name: 'L2'
factor: 0.00001
Architecture:
model_type: rec
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV3
scale: 0.5
model_name: small
small_stride: [1, 2, 2, 2]
disable_se: True
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 48
Head:
name: CTCHead
fc_decay: 0.00001
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: SimpleDataSet
data_dir: train_data/ic15_data
label_file_list: ["train_data/ic15_data/rec_gt_train.txt"]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- RecAug:
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 320]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 256
drop_last: True
num_workers: 8
Eval:
dataset:
name: SimpleDataSet
data_dir: train_data/ic15_data
label_file_list: ["train_data/ic15_data/rec_gt_test.txt"]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 320]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 8

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ch_ppocr_mobile_v2.0_rec_FPGM
python:python3.7
gpu_list:0
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=128|whole_train_whole_infer=128
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/ic15_data/test/word_1.png
null:null
##
trainer:fpgm_train
norm_train:null
pact_train:null
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_FPGM/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model=./pretrain_models/ch_ppocr_mobile_v2.0_rec_train/best_accuracy
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:null
quant_export:null
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_FPGM/rec_chinese_lite_train_v2.0.yml -o
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:null
infer_export:null
infer_quant:False
inference:tools/infer/predict_rec.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|int8
--rec_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True
null:null

View File

@ -0,0 +1,21 @@
===========================kl_quant_params===========================
model_name:ch_ppocr_mobile_v2.0_rec_KL
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_ppocr_mobile_v2.0_rec_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_KL/rec_chinese_lite_train_v2.0.yml -o
infer_quant:True
inference:tools/infer/predict_rec.py
--use_gpu:False|True
--enable_mkldnn:True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:int8
--det_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True
null:null
null:null

View File

@ -0,0 +1,101 @@
Global:
use_gpu: true
epoch_num: 500
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec_chinese_lite_v2.0
save_epoch_step: 3
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 25
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_chinese_lite_v2.0.txt
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
lr:
name: Cosine
learning_rate: 0.001
regularizer:
name: 'L2'
factor: 0.00001
Architecture:
model_type: rec
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV3
scale: 0.5
model_name: small
small_stride: [1, 2, 2, 2]
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 48
Head:
name: CTCHead
fc_decay: 0.00001
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: SimpleDataSet
data_dir: train_data/ic15_data
label_file_list: ["train_data/ic15_data/rec_gt_train.txt"]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- RecAug:
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 320]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 256
drop_last: True
num_workers: 8
Eval:
dataset:
name: SimpleDataSet
data_dir: train_data/ic15_data
label_file_list: ["train_data/ic15_data/rec_gt_test.txt"]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 320]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 8

View File

@ -0,0 +1,101 @@
Global:
use_gpu: true
epoch_num: 500
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec_chinese_lite_v2.0
save_epoch_step: 3
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [0, 2000]
cal_metric_during_train: True
pretrained_model:
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_words/ch/word_1.jpg
# for data or label process
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 25
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_chinese_lite_v2.0.txt
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
lr:
name: Cosine
learning_rate: 0.001
regularizer:
name: 'L2'
factor: 0.00001
Architecture:
model_type: rec
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV3
scale: 0.5
model_name: small
small_stride: [1, 2, 2, 2]
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 48
Head:
name: CTCHead
fc_decay: 0.00001
Loss:
name: CTCLoss
PostProcess:
name: CTCLabelDecode
Metric:
name: RecMetric
main_indicator: acc
Train:
dataset:
name: SimpleDataSet
data_dir: train_data/ic15_data
label_file_list: ["train_data/ic15_data/rec_gt_train.txt"]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- RecAug:
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 320]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 256
drop_last: True
num_workers: 8
Eval:
dataset:
name: SimpleDataSet
data_dir: train_data/ic15_data
label_file_list: ["train_data/ic15_data/rec_gt_test.txt"]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- CTCLabelEncode: # Class handling label
- RecResizeImg:
image_shape: [3, 32, 320]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 8

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ch_ppocr_mobile_v2.0_rec_PACT
python:python3.7
gpu_list:0
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=128|whole_train_whole_infer=128
Global.checkpoints:null
train_model_name:latest
train_infer_img_dir:./train_data/ic15_data/test/word_1.png
null:null
##
trainer:pact_train
norm_train:null
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_PACT/rec_chinese_lite_train_v2.0.yml -o
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.checkpoints:
norm_export:null
quant_export:deploy/slim/quantization/export_model.py -ctest_tipc/configs/ch_ppocr_mobile_v2.0_rec_PACT/rec_chinese_lite_train_v2.0.yml -o
fpgm_export:null
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:null
infer_export:null
infer_quant:False
inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ppocr_keys_v1.txt --rec_image_shape="3,32,100"
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--rec_model_dir:
--image_dir:./inference/rec_inference
--save_log_path:./test/output/
--benchmark:True
null:null

View File

@ -0,0 +1,19 @@
===========================ch_ppocr_server_v2.0===========================
model_name:ch_ppocr_server_v2.0
python:python3.7
infer_model:./inference/ch_ppocr_server_v2.0_det_infer/
infer_export:null
infer_quant:True
inference:tools/infer/predict_system.py
--use_gpu:False|True
--enable_mkldnn:False|True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
--rec_model_dir:./inference/ch_ppocr_server_v2.0_rec_infer/
--benchmark:True
null:null
null:null

View File

@ -1,12 +1,12 @@
===========================train_params===========================
model_name:ocr_server_det
model_name:ch_ppocr_server_v2.0_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_infer=2|whole_train_infer=300
Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_lite_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ch_ppocr_server_v2.0_rec
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=100
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=128|whole_train_whole_infer=128
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./inference/rec_inference
null:null
##
trainer:norm_train
norm_train:tools/train.py -c test_tipc/configs/ch_ppocr_server_v2.0_rec/rec_icdar15_train.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:tools/eval.py -c test_tipc/configs/ch_ppocr_server_v2.0_rec/rec_icdar15_train.yml -o
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/ch_ppocr_server_v2.0_rec/rec_icdar15_train.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
##
infer_model:null
infer_export:tools/export_model.py -c test_tipc/configs/ch_ppocr_server_v2.0_rec/rec_icdar15_train.yml -o
infer_quant:False
inference:tools/infer/predict_rec.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:True|False
--precision:fp32|int8
--rec_model_dir:
--image_dir:./inference/rec_inference
--save_log_path:./test/output/
--benchmark:True
null:null

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:det_mv3_db_v2.0
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train
norm_train:tools/train.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c configs/det/det_mv3_db.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:./inference/det_mv3_db_v2.0_train/best_accuracy
infer_export:tools/export_model.py -c configs/det/det_mv3_db.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null

View File

@ -0,0 +1,109 @@
Global:
use_gpu: true
epoch_num: 10000
log_smooth_window: 20
print_batch_step: 2
save_model_dir: ./output/east_mv3/
save_epoch_step: 1000
# evaluation is run every 5000 iterations after the 4000th iteration
eval_batch_step: [4000, 5000]
cal_metric_during_train: False
pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
checkpoints:
save_inference_dir:
use_visualdl: False
infer_img:
save_res_path: ./output/det_east/predicts_east.txt
Architecture:
model_type: det
algorithm: EAST
Transform:
Backbone:
name: MobileNetV3
scale: 0.5
model_name: large
Neck:
name: EASTFPN
model_name: small
Head:
name: EASTHead
model_name: small
Loss:
name: EASTLoss
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
lr:
# name: Cosine
learning_rate: 0.001
# warmup_epoch: 0
regularizer:
name: 'L2'
factor: 0
PostProcess:
name: EASTPostProcess
score_thresh: 0.8
cover_thresh: 0.1
nms_thresh: 0.2
Metric:
name: DetMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
label_file_list:
- ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
ratio_list: [1.0]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- EASTProcessTrain:
image_shape: [512, 512]
background_ratio: 0.125
min_crop_side_ratio: 0.1
min_text_size: 10
- KeepKeys:
keep_keys: ['image', 'score_map', 'geo_map', 'training_mask'] # dataloader will return list in this order
loader:
shuffle: True
drop_last: False
batch_size_per_card: 16
num_workers: 8
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
label_file_list:
- ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
limit_side_len: 2400
limit_type: max
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: ['image', 'shape', 'polys', 'ignore_tags']
loader:
shuffle: False
drop_last: False
batch_size_per_card: 1 # must be 1
num_workers: 2

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:det_mv3_east_v2.0
python:python3.7
gpu_list:0
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=500
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train
norm_train:tools/train.py -c test_tipc/configs/det_mv3_east_v2.0/det_mv3_east.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_east_v2.0/det_mv3_east.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
##
train_model:./inference/det_mv3_east/best_accuracy
infer_export:tools/export_model.py -c test_tipc/cconfigs/det_mv3_east_v2.0/det_mv3_east.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
--save_log_path:null
--benchmark:True
--det_algorithm:EAST

View File

@ -0,0 +1,135 @@
Global:
use_gpu: true
epoch_num: 600
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/det_mv3_pse/
save_epoch_step: 600
# evaluation is run every 63 iterations
eval_batch_step: [ 0,1000 ]
cal_metric_during_train: False
pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
checkpoints: #./output/det_r50_vd_pse_batch8_ColorJitter/best_accuracy
save_inference_dir:
use_visualdl: False
infer_img: doc/imgs_en/img_10.jpg
save_res_path: ./output/det_pse/predicts_pse.txt
Architecture:
model_type: det
algorithm: PSE
Transform: null
Backbone:
name: MobileNetV3
scale: 0.5
model_name: large
Neck:
name: FPN
out_channels: 96
Head:
name: PSEHead
hidden_dim: 96
out_channels: 7
Loss:
name: PSELoss
alpha: 0.7
ohem_ratio: 3
kernel_sample_mask: pred
reduction: none
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
lr:
name: Step
learning_rate: 0.001
step_size: 200
gamma: 0.1
regularizer:
name: 'L2'
factor: 0.0005
PostProcess:
name: PSEPostProcess
thresh: 0
box_thresh: 0.85
min_area: 16
box_type: box # 'box' or 'poly'
scale: 1
Metric:
name: DetMetric
main_indicator: hmean
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
label_file_list:
- ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
ratio_list: [ 1.0 ]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- ColorJitter:
brightness: 0.12549019607843137
saturation: 0.5
- IaaAugment:
augmenter_args:
- { 'type': Resize, 'args': { 'size': [ 0.5, 3 ] } }
- { 'type': Fliplr, 'args': { 'p': 0.5 } }
- { 'type': Affine, 'args': { 'rotate': [ -10, 10 ] } }
- MakePseGt:
kernel_num: 7
min_shrink_ratio: 0.4
size: 640
- RandomCropImgMask:
size: [ 640,640 ]
main_key: gt_text
crop_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ]
- NormalizeImage:
scale: 1./255.
mean: [ 0.485, 0.456, 0.406 ]
std: [ 0.229, 0.224, 0.225 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ] # the order of the dataloader list
loader:
shuffle: True
drop_last: False
batch_size_per_card: 16
num_workers: 8
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data/icdar2015/text_localization/
label_file_list:
- ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
ratio_list: [ 1.0 ]
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- DetLabelEncode: # Class handling label
- DetResizeForTest:
limit_side_len: 736
limit_type: min
- NormalizeImage:
scale: 1./255.
mean: [ 0.485, 0.456, 0.406 ]
std: [ 0.229, 0.224, 0.225 ]
order: 'hwc'
- ToCHWImage:
- KeepKeys:
keep_keys: [ 'image', 'shape', 'polys', 'ignore_tags' ]
loader:
shuffle: False
drop_last: False
batch_size_per_card: 1 # must be 1
num_workers: 8

View File

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:det_mv3_pse_v2.0
python:python3.7
gpu_list:0
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=500
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train
norm_train:tools/train.py -c test_tipc/configs/det_mv3_pse_v2.0/det_mv3_pse.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_pse_v2.0/det_mv3_pse.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
##
train_model:./inference/det_mv3_pse/best_accuracy
infer_export:tools/export_model.py -c test_tipc/cconfigs/det_mv3_pse_v2.0/det_mv3_pse.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
--save_log_path:null
--benchmark:True
--det_algorithm:PSE

Some files were not shown because too many files have changed in this diff Show More