Merge branch 'PaddlePaddle:dygraph' into dygraph
commit 2945abd703
@@ -5,28 +5,36 @@ set -xe
function _set_params(){
    run_mode=${1:-"sp"}            # single-card sp | multi-card mp
    batch_size=${2:-"64"}
    fp_item=${3:-"fp32"}           # fp32|fp16
    max_iter=${4:-"10"}            # optional; modify the code in advance if training should stop early
    model_name=${5:-"model_name"}
    fp_item=${3:-"fp32"}           # fp32|fp16
    max_epoch=${4:-"10"}           # optional; modify the code in advance if training should stop early
    model_item=${5:-"model_item"}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # TRAIN_LOG_DIR is set later by QA

    # parameters needed for log parsing
    base_batch_size=${batch_size}
    mission_name="OCR"
    direction_id="0"
    ips_unit="images/sec"
    skip_steps=2                   # steps skipped when parsing logs; the first few steps of some models are slow (required)
    keyword="ips:"                 # keyword that selects the data lines when parsing logs (required)
    index="1"
    model_name=${model_item}_bs${batch_size}_${fp_item}  # model_item is used to match the yml file name; model_name is shown on the front end after the data is stored
    # no changes needed below
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
    log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
}
function _train(){
    echo "Train on ${num_gpu_devices} GPUs"
    echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"

    train_cmd="-c configs/det/${model_name}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_iter} Global.eval_batch_step=[0,20000] Global.print_batch_step=2"
    train_cmd="-c configs/det/${model_item}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_epoch} Global.eval_batch_step=[0,20000] Global.print_batch_step=2"
    case ${run_mode} in
    sp)
        train_cmd="python3.7 tools/train.py "${train_cmd}""
        train_cmd="python tools/train.py "${train_cmd}""
        ;;
    mp)
        train_cmd="python3.7 -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
        ;;
    *) echo "choose run_mode(sp or mp)"; exit 1;
    esac
@@ -46,17 +54,7 @@ function _train(){
    fi
}

function _analysis_log(){
    analysis_cmd="python3.7 benchmark/analysis.py --filename ${log_file} --mission_name ${model_name} --run_mode ${run_mode} --direction_id 0 --keyword 'ips:' --base_batch_size ${batch_size} --skip_steps 1 --gpu_num ${num_gpu_devices} --index 1 --model_mode=-1 --ips_unit=samples/sec"
    eval $analysis_cmd
}

function _kill_process(){
    kill -9 `ps -ef|grep 'python3.7'|awk '{print $2}'`
}


source ${BENCHMARK_ROOT}/scripts/run_model.sh  # this script parses benchmark-compliant logs with analysis.py; during joint debugging it can be downloaded from the benchmark repo: https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh. If you are not joint-debugging and only want the training log, comment this line out, but re-enable it before submitting.
_set_params $@
_train
_analysis_log
_kill_process
#_train           # uncomment to produce only the training log, without parsing
_run              # defined in run_model.sh; it calls _train internally. If you are not joint-debugging and only want the training log, comment this line out, but re-enable it before submitting.
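To make the log-parsing contract set up above concrete (the `keyword="ips:"`, `skip_steps` and `index` parameters), here is a minimal, self-contained Python sketch of how such a parser might pick the throughput value out of training logs. The real parsing lives in `benchmark/analysis.py` from the PaddlePaddle/benchmark repo; the sample log format below is an assumption for illustration only.

```python
# Minimal sketch of the log-parsing contract: select the lines containing
# the keyword "ips:", skip the first `skip_steps` of them (warm-up steps),
# and average the value that follows the keyword.
import re

def parse_ips(log_text, keyword="ips:", skip_steps=2):
    values = []
    for line in log_text.splitlines():
        if keyword not in line:
            continue
        # take the first number that follows the keyword on this line
        match = re.search(re.escape(keyword) + r"\s*([0-9.]+)", line)
        if match:
            values.append(float(match.group(1)))
    values = values[skip_steps:]  # the first steps are slow and skipped
    return sum(values) / len(values) if values else 0.0

sample_log = """epoch: 1, step: 1, ips: 10.1 images/sec
epoch: 1, step: 2, ips: 55.0 images/sec
epoch: 1, step: 3, ips: 62.3 images/sec
epoch: 1, step: 4, ips: 61.7 images/sec"""
print(parse_ips(sample_log))  # averages only the steps after the skipped ones
```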
@@ -1,25 +1,35 @@
#!/bin/bash
# Script for stably reproducible benchmark results; by default it runs with py37 inside the standard docker image: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7, paddle=2.1.2, py=37
# Working directory: ./PaddleOCR
# 1. Install the dependencies this model needs (note any enabled optimization strategies)
python3.7 -m pip install -r requirements.txt
log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
python -m pip install -r requirements.txt
# 2. Copy the data and pretrained models this model needs
wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data && tar xf icdar2015.tar && cd ../
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams
# 3. Run in batch (if batching is inconvenient, steps 1 and 2 must go into each individual model)

model_mode_list=(det_res18_db_v2.0 det_r50_vd_east det_r50_vd_pse)
fp_item_list=(fp32)
bs_list=(8 16)
for model_mode in ${model_mode_list[@]}; do
    for fp_item in ${fp_item_list[@]}; do
        if [ ${model_mode} == "det_r50_vd_east" ]; then
            bs_list=(16)
        else
            bs_list=(8 16)
        fi
        for bs_item in ${bs_list[@]}; do
            echo "index is speed, 1gpus, begin, ${model_name}"
            run_mode=sp
            CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode}  # (5min)
            log_name=ocr_${model_mode}_bs${bs_item}_${fp_item}_${run_mode}
            CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 1 ${model_mode} | tee ${log_path}/${log_name}_speed_1gpus 2>&1  # (5min)
            sleep 60
            echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
            run_mode=mp
            CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode}
            log_name=ocr_${model_mode}_bs${bs_item}_${fp_item}_${run_mode}
            CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
            sleep 60
        done
    done
@@ -62,8 +62,7 @@ Loss:
        weight: 0.05
        num_classes: 6625
        feat_dim: 96
        init_center: false
        center_file_path: "./train_center.pkl"
        center_file_path:
  # you can also try to add ace loss on your own dataset
  # - ACELoss:
  #     weight: 0.1
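For context on this config change: the `init_center` switch is removed, and an empty `center_file_path:` in YAML loads as `None`, which means no center file is read and the centers are randomly initialized. A hedged sketch of that mapping, with a stand-in `build_loss` factory (not ppocr's real one):

```python
# Hedged sketch of how the Loss entry above maps to CenterLoss keyword
# arguments after this change: `init_center` is gone, and an empty YAML
# value for center_file_path becomes None (random center initialization).
loss_config = {
    "weight": 0.05,
    "num_classes": 6625,
    "feat_dim": 96,
    "center_file_path": None,  # empty YAML value -> None -> random init
}

def build_loss(cfg):
    # stand-in for ppocr's config-driven loss factory
    weight = cfg.pop("weight")
    # with center_file_path=None the centers are randomly initialized;
    # a pickle of precomputed centers is loaded only when a path is given
    print("CenterLoss(**{}) scaled by {}".format(cfg, weight))

build_loss(dict(loss_config))
```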
@@ -34,10 +34,10 @@ PaddleOCR model deployment.

* First, download the package of OpenCV compiled from source for Linux from the OpenCV official site. Taking opencv3.4.7 as an example, the download commands are as follows.

```
```bash
cd deploy/cpp_infer
wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz
tar -xf 3.4.7.tar.gz
wget https://paddleocr.bj.bcebos.com/libs/opencv/opencv-3.4.7.tar.gz
tar -xf opencv-3.4.7.tar.gz
```

Finally, you can see the folder `opencv-3.4.7/` in the current directory.
@@ -45,12 +45,13 @@ tar -xf 3.4.7.tar.gz
* Compile OpenCV: set the OpenCV source path (`root_path`) and the installation path (`install_path`). Enter the OpenCV source directory and compile as follows.

```shell
root_path=your_opencv_root_path
root_path="your_opencv_root_path"
install_path=${root_path}/opencv3
build_dir=${root_path}/build

rm -rf build
mkdir build
cd build
rm -rf ${build_dir}
mkdir ${build_dir}
cd ${build_dir}

cmake .. \
    -DCMAKE_INSTALL_PREFIX=${install_path} \
@@ -74,6 +75,11 @@ make -j
make install
```

You can also edit the content of `tools/build_opencv.sh` directly and then compile by running the command below.

```shell
sh tools/build_opencv.sh
```

Here `root_path` is the path of the downloaded OpenCV source code and `install_path` is the OpenCV installation path. After `make install` finishes, OpenCV header and library files are generated in this folder, and they are used to compile the OCR code later.
@@ -233,12 +239,12 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
--image_dir=../../doc/imgs/12.jpg
```

More parameters are listed below:
The supported tunable parameters are explained in more detail below:

- Common parameters

|parameter|type|default|meaning|
| --- | --- | --- | --- |
| :---: | :---: | :---: | :---: |
|use_gpu|bool|false|whether to use GPU|
|gpu_id|int|0|GPU id, effective when GPU is used|
|gpu_mem|int|4000|GPU memory to request|

@@ -248,7 +254,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
- Detection model parameters

|parameter|type|default|meaning|
| --- | --- | --- | --- |
| :---: | :---: | :---: | :---: |
|det_model_dir|string|-|path of the detection inference model|
|max_side_len|int|960|when the long side of the input image exceeds 960, the image is scaled proportionally so that its longest side is 960|
|det_db_thresh|float|0.3|used to filter the binarized map predicted by DB; values between 0 and 0.3 barely affect the result|

@@ -260,7 +266,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
- Direction classifier parameters

|parameter|type|default|meaning|
| --- | --- | --- | --- |
| :---: | :---: | :---: | :---: |
|use_angle_cls|bool|false|whether to use the direction classifier|
|cls_model_dir|string|-|path of the direction classifier inference model|
|cls_thresh|float|0.9|score threshold of the direction classifier|

@@ -268,7 +274,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
- Recognition model parameters

|parameter|type|default|meaning|
| --- | --- | --- | --- |
| :---: | :---: | :---: | :---: |
|rec_model_dir|string|-|path of the recognition inference model|
|char_list_file|string|../../ppocr/utils/ppocr_keys_v1.txt|dictionary file|
@@ -17,10 +17,10 @@ PaddleOCR model deployment.

* First of all, you need to download the source code compiled package in the Linux environment from the opencv official website. Taking opencv3.4.7 as an example, the download command is as follows.

```
```bash
cd deploy/cpp_infer
wget https://github.com/opencv/opencv/archive/3.4.7.tar.gz
tar -xf 3.4.7.tar.gz
wget https://paddleocr.bj.bcebos.com/libs/opencv/opencv-3.4.7.tar.gz
tar -xf opencv-3.4.7.tar.gz
```

Finally, you can see the folder of `opencv-3.4.7/` in the current directory.
@@ -0,0 +1,28 @@
root_path="/paddle/PaddleOCR/deploy/cpp_infer/opencv-3.4.7"
install_path=${root_path}/opencv3
build_dir=${root_path}/build

rm -rf ${build_dir}
mkdir ${build_dir}
cd ${build_dir}

cmake .. \
    -DCMAKE_INSTALL_PREFIX=${install_path} \
    -DCMAKE_BUILD_TYPE=Release \
    -DBUILD_SHARED_LIBS=OFF \
    -DWITH_IPP=OFF \
    -DBUILD_IPP_IW=OFF \
    -DWITH_LAPACK=OFF \
    -DWITH_EIGEN=OFF \
    -DCMAKE_INSTALL_LIBDIR=lib64 \
    -DWITH_ZLIB=ON \
    -DBUILD_ZLIB=ON \
    -DWITH_JPEG=ON \
    -DBUILD_JPEG=ON \
    -DWITH_PNG=ON \
    -DBUILD_PNG=ON \
    -DWITH_TIFF=ON \
    -DBUILD_TIFF=ON

make -j
make install
@@ -172,7 +172,10 @@ void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
  cv::Mat resize_img;

  int index = 0;

  std::vector<double> time_info = {0, 0, 0};
  for (int i = boxes.size() - 1; i >= 0; i--) {
    auto preprocess_start = std::chrono::steady_clock::now();
    crop_img = GetRotateCropImage(srcimg, boxes[i]);
    if (use_direction_classify >= 1) {
      crop_img = RunClsModel(crop_img, predictor_cls);

@@ -191,7 +194,9 @@ void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
    auto *data0 = input_tensor0->mutable_data<float>();

    NeonMeanScale(dimg, data0, resize_img.rows * resize_img.cols, mean, scale);
    auto preprocess_end = std::chrono::steady_clock::now();
    //// Run CRNN predictor
    auto inference_start = std::chrono::steady_clock::now();
    predictor_crnn->Run();

    // Get output and run postprocess

@@ -199,8 +204,10 @@ void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
        std::move(predictor_crnn->GetOutput(0)));
    auto *predict_batch = output_tensor0->data<float>();
    auto predict_shape = output_tensor0->shape();
    auto inference_end = std::chrono::steady_clock::now();

    // ctc decode
    auto postprocess_start = std::chrono::steady_clock::now();
    std::string str_res;
    int argmax_idx;
    int last_index = 0;

@@ -224,7 +231,20 @@ void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
    score /= count;
    rec_text.push_back(str_res);
    rec_text_score.push_back(score);
    auto postprocess_end = std::chrono::steady_clock::now();

    std::chrono::duration<float> preprocess_diff = preprocess_end - preprocess_start;
    time_info[0] += double(preprocess_diff.count() * 1000);
    std::chrono::duration<float> inference_diff = inference_end - inference_start;
    time_info[1] += double(inference_diff.count() * 1000);
    std::chrono::duration<float> postprocess_diff = postprocess_end - postprocess_start;
    time_info[2] += double(postprocess_diff.count() * 1000);

  }

  times->push_back(time_info[0]);
  times->push_back(time_info[1]);
  times->push_back(time_info[2]);
}

std::vector<std::vector<std::vector<int>>>

@@ -312,7 +332,6 @@ std::shared_ptr<PaddlePredictor> loadModel(std::string model_file, int num_threads)
  config.set_model_from_file(model_file);

  config.set_threads(num_threads);

  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);
  return predictor;
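The C++ changes above accumulate per-stage latencies (preprocess, inference, postprocess) in milliseconds across all boxes and return them as a three-element vector. A minimal, self-contained Python sketch of the same pattern, with placeholder callables that are not PaddleOCR APIs:

```python
# Same per-stage timing pattern as the C++ changes above: sum the
# milliseconds spent in each stage over all items, return stage totals.
import time

def run_with_timing(items, preprocess, infer, postprocess):
    time_info = [0.0, 0.0, 0.0]  # ms per stage, summed over items
    results = []
    for item in items:
        t0 = time.perf_counter()
        x = preprocess(item)
        t1 = time.perf_counter()
        y = infer(x)
        t2 = time.perf_counter()
        results.append(postprocess(y))
        t3 = time.perf_counter()
        time_info[0] += (t1 - t0) * 1000
        time_info[1] += (t2 - t1) * 1000
        time_info[2] += (t3 - t2) * 1000
    return results, time_info

res, times = run_with_timing([1, 2, 3], lambda v: v, lambda v: v * 2, str)
print(res, times)
```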
@@ -434,6 +453,9 @@ void system(char **argv){
  auto rec_predictor = loadModel(rec_model_file, std::stoi(num_threads));
  auto cls_predictor = loadModel(cls_model_file, std::stoi(num_threads));

  std::vector<double> det_time_info = {0, 0, 0};
  std::vector<double> rec_time_info = {0, 0, 0};

  for (int i = 0; i < cv_all_img_names.size(); ++i) {
    std::cout << "The predict img: " << cv_all_img_names[i] << std::endl;
    cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR);

@@ -459,8 +481,38 @@ void system(char **argv){
    //// print recognized text
    for (int i = 0; i < rec_text.size(); i++) {
      std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i]
                << std::endl;
    }

    det_time_info[0] += det_times[0];
    det_time_info[1] += det_times[1];
    det_time_info[2] += det_times[2];
    rec_time_info[0] += rec_times[0];
    rec_time_info[1] += rec_times[1];
    rec_time_info[2] += rec_times[2];
  }
  if (strcmp(argv[12], "True") == 0) {
    AutoLogger autolog_det(det_model_file,
                           runtime_device,
                           std::stoi(num_threads),
                           std::stoi(batchsize),
                           "dynamic",
                           precision,
                           det_time_info,
                           cv_all_img_names.size());
    AutoLogger autolog_rec(rec_model_file,
                           runtime_device,
                           std::stoi(num_threads),
                           std::stoi(batchsize),
                           "dynamic",
                           precision,
                           rec_time_info,
                           cv_all_img_names.size());

    autolog_det.report();
    std::cout << std::endl;
    autolog_rec.report();
  }
}
@@ -503,15 +555,15 @@ void det(int argc, char **argv) {
  auto img_vis = Visualization(srcimg, boxes);
  std::cout << boxes.size() << " bboxes have detected:" << std::endl;

  // for (int i=0; i<boxes.size(); i++){
  //   std::cout << "The " << i << " box:" << std::endl;
  //   for (int j=0; j<4; j++){
  //     for (int k=0; k<2; k++){
  //       std::cout << boxes[i][j][k] << "\t";
  //     }
  //   }
  //   std::cout << std::endl;
  // }
  for (int i=0; i<boxes.size(); i++){
    std::cout << "The " << i << " box:" << std::endl;
    for (int j=0; j<4; j++){
      for (int k=0; k<2; k++){
        std::cout << boxes[i][j][k] << "\t";
      }
    }
    std::cout << std::endl;
  }
  time_info[0] += times[0];
  time_info[1] += times[1];
  time_info[2] += times[2];
@@ -585,6 +637,9 @@ void rec(int argc, char **argv) {
    std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i]
              << std::endl;
  }
  time_info[0] += times[0];
  time_info[1] += times[1];
  time_info[2] += times[2];
}
// TODO: support autolog
if (strcmp(argv[9], "True") == 0) {
@@ -52,12 +52,17 @@ def main(config, device, logger, vdl_writer):
    config['Architecture']["Head"]['out_channels'] = char_num
    model = build_model(config['Architecture'])

    flops = paddle.flops(model, [1, 3, 640, 640])
    logger.info(f"FLOPs before pruning: {flops}")
    if config['Architecture']['model_type'] == 'det':
        input_shape = [1, 3, 640, 640]
    elif config['Architecture']['model_type'] == 'rec':
        input_shape = [1, 3, 32, 320]

    flops = paddle.flops(model, input_shape)
    logger.info("FLOPs before pruning: {}".format(flops))

    from paddleslim.dygraph import FPGMFilterPruner
    model.train()
    pruner = FPGMFilterPruner(model, [1, 3, 640, 640])
    pruner = FPGMFilterPruner(model, input_shape)

    # build metric
    eval_class = build_metric(config['Metric'])

@@ -65,8 +70,13 @@ def main(config, device, logger, vdl_writer):
    def eval_fn():
        metric = program.eval(model, valid_dataloader, post_process_class,
                              eval_class)
        logger.info(f"metric['hmean']: {metric['hmean']}")
        return metric['hmean']
        if config['Architecture']['model_type'] == 'det':
            main_indicator = 'hmean'
        else:
            main_indicator = 'acc'
        logger.info("metric[{}]: {}".format(main_indicator, metric[
            main_indicator]))
        return metric[main_indicator]

    params_sensitive = pruner.sensitive(
        eval_func=eval_fn,

@@ -81,18 +91,22 @@ def main(config, device, logger, vdl_writer):
    # calculate the pruned params' ratio
    params_sensitive = pruner._get_ratios_by_loss(params_sensitive, loss=0.02)
    for key in params_sensitive.keys():
        logger.info(f"{key}, {params_sensitive[key]}")
        logger.info("{}, {}".format(key, params_sensitive[key]))

    plan = pruner.prune_vars(params_sensitive, [0])

    flops = paddle.flops(model, [1, 3, 640, 640])
    logger.info(f"FLOPs after pruning: {flops}")
    flops = paddle.flops(model, input_shape)
    logger.info("FLOPs after pruning: {}".format(flops))

    # load pretrain model
    load_model(config, model)
    metric = program.eval(model, valid_dataloader, post_process_class,
                          eval_class)
    logger.info(f"metric['hmean']: {metric['hmean']}")
    if config['Architecture']['model_type'] == 'det':
        main_indicator = 'hmean'
    else:
        main_indicator = 'acc'
    logger.info("metric[{}]: {}".format(main_indicator, metric[main_indicator]))

    # start export model
    from paddle.jit import to_static
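The recurring change in these pruning tools is selecting the profiling shape by model type before calling `paddle.flops`. A hedged, standalone sketch of that logic, assuming paddle is installed; the tiny stand-in network below only illustrates the call, it is not a PaddleOCR model:

```python
# Sketch of the shape-selection logic introduced above: detection models are
# profiled at 640x640, recognition models at 32x320.
import paddle

def flops_for(model, model_type):
    if model_type == 'det':
        input_shape = [1, 3, 640, 640]
    elif model_type == 'rec':
        input_shape = [1, 3, 32, 320]
    else:
        raise ValueError("expected model_type 'det' or 'rec'")
    return paddle.flops(model, input_shape)

# example with a tiny stand-in network
net = paddle.nn.Sequential(paddle.nn.Conv2D(3, 8, 3, padding=1))
print(flops_for(net, 'det'))
print(flops_for(net, 'rec'))
```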
@@ -73,13 +73,18 @@ def main(config, device, logger, vdl_writer):
    char_num = len(getattr(post_process_class, 'character'))
    config['Architecture']["Head"]['out_channels'] = char_num
    model = build_model(config['Architecture'])
    if config['Architecture']['model_type'] == 'det':
        input_shape = [1, 3, 640, 640]
    elif config['Architecture']['model_type'] == 'rec':
        input_shape = [1, 3, 32, 320]
    flops = paddle.flops(model, input_shape)

    flops = paddle.flops(model, [1, 3, 640, 640])
    logger.info("FLOPs before pruning: {}".format(flops))

    from paddleslim.dygraph import FPGMFilterPruner
    model.train()
    pruner = FPGMFilterPruner(model, [1, 3, 640, 640])

    pruner = FPGMFilterPruner(model, input_shape)

    # build loss
    loss_class = build_loss(config['Loss'])

@@ -107,8 +112,14 @@ def main(config, device, logger, vdl_writer):
    def eval_fn():
        metric = program.eval(model, valid_dataloader, post_process_class,
                              eval_class, False)
        logger.info("metric['hmean']: {}".format(metric['hmean']))
        return metric['hmean']
        if config['Architecture']['model_type'] == 'det':
            main_indicator = 'hmean'
        else:
            main_indicator = 'acc'

        logger.info("metric[{}]: {}".format(main_indicator, metric[
            main_indicator]))
        return metric[main_indicator]

    run_sensitive_analysis = False
    """

@@ -149,7 +160,7 @@ def main(config, device, logger, vdl_writer):

    plan = pruner.prune_vars(params_sensitive, [0])

    flops = paddle.flops(model, [1, 3, 640, 640])
    flops = paddle.flops(model, input_shape)
    logger.info("FLOPs after pruning: {}".format(flops))

    # start train
@@ -247,3 +247,7 @@ Q1: The prediction results are inconsistent after converting a trained model into an inference model?

**A**: This problem comes up often, and it is usually caused by the preprocessing and postprocessing parameters used when predicting with the trained model differing from those used when predicting with the inference model. Taking a model trained with the det_mv3_db.yml config file as an example, the inconsistency between the trained model and the inference model can be resolved as follows:
- Check whether the [trained model preprocessing](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L116) is consistent with the [inference model prediction preprocessing](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/predict_det.py#L42). When the algorithm is evaluated, the input image size affects the accuracy. To stay consistent with the paper, the icdar15 training config resizes images to [736, 1280], but inference-model prediction has only one set of default parameters: for prediction speed, it limits the longest image side to 960 when resizing by default. The preprocessing functions of the trained model and the inference model are in [ppocr/data/imaug/operators.py](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/ppocr/data/imaug/operators.py#L147)
- Check whether the [trained model postprocessing](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L51) is consistent with the [inference postprocessing parameters](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/utility.py#L50).

Q2: Training an EAST model reports that the lanms library cannot be found?

**A**: Run pip3 install lanms-nova.
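The resize mismatch described in the answer above can be made concrete with a small standalone sketch: training evaluation resizes to a fixed [736, 1280], while the default inference preprocessing only caps the longest side at 960. The real operators live in ppocr/data/imaug/operators.py; the two functions below are illustrations only.

```python
# Illustration of the preprocessing mismatch: fixed-size eval resize vs.
# aspect-preserving inference resize with a longest-side limit.
def eval_resize(h, w, target=(736, 1280)):
    # fixed-size resize used by the icdar15 training/eval config
    return target

def infer_resize(h, w, limit_side_len=960):
    # keep the aspect ratio, cap the longest side at limit_side_len
    ratio = min(1.0, limit_side_len / max(h, w))
    return int(h * ratio), int(w * ratio)

print(eval_resize(720, 1280))   # (736, 1280)
print(infer_resize(720, 1280))  # (540, 960): different input -> different output
```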
@@ -34,6 +34,8 @@ Inference models (models saved with `paddle.jit.save`)
- [1. Ultra-lightweight Chinese OCR model inference](#超轻量中文OCR模型推理)
- [2. Other model inference](#其他模型推理)

- [6. Parameter explanation](#参数解释)


<a name="训练模型转inference模型"></a>
## 1. Converting a trained model into an inference model
@@ -394,3 +396,127 @@ python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_10.jpg" --d
After running the command, the image with the recognition results is as follows:




<a name="参数解释"></a>
# 6. Parameter explanation

More parameters of the prediction process are explained below.

* Global information

| parameter | type | default | meaning |
| :--: | :--: | :--: | :--: |
| image_dir | str | none; must be specified explicitly | image or folder path |
| vis_font_path | str | "./doc/fonts/simfang.ttf" | font path used for visualization |
| drop_score | float | 0.5 | recognition results whose score is below this value are discarded and not returned |
| use_pdserving | bool | False | whether to use Paddle Serving for prediction |
| warmup | bool | False | whether to enable warmup; useful when measuring prediction time |
| draw_img_save_dir | str | "./inference_results" | folder where the OCR results of the pipelined system prediction are saved |
| save_crop_res | bool | False | whether to save the recognized text images of OCR |
| crop_res_save_dir | str | "./output" | path where the text images recognized by OCR are saved |
| use_mp | bool | False | whether to enable multi-process prediction |
| total_process_num | int | 6 | number of processes to launch; effective when `use_mp` is `True` |
| process_id | int | 0 | id of the current process; no need to change it yourself |
| benchmark | bool | False | whether to enable benchmark statistics such as prediction speed and memory usage |
| save_log_path | str | "./log_output/" | folder where log results are saved when `benchmark` is enabled |
| show_log | bool | True | whether to show log messages during prediction |
| use_onnx | bool | False | whether to enable ONNX prediction |


* Prediction engine

| parameter | type | default | meaning |
| :--: | :--: | :--: | :--: |
| use_gpu | bool | True | whether to use GPU for prediction |
| ir_optim | bool | True | whether to analyze and optimize the computation graph; enabling it speeds up prediction |
| use_tensorrt | bool | False | whether to enable TensorRT |
| min_subgraph_size | int | 15 | minimum subgraph size for TensorRT; a subgraph is computed with the TRT engine only when its size exceeds this value |
| precision | str | fp32 | prediction precision; `fp32`, `fp16` and `int8` are supported |
| enable_mkldnn | bool | True | whether to enable MKL-DNN |
| cpu_threads | int | 10 | number of CPU prediction threads when MKL-DNN is enabled |

* Text detection model

| parameter | type | default | meaning |
| :--: | :--: | :--: | :--: |
| det_algorithm | str | "DB" | name of the text detection algorithm; `DB`, `EAST`, `SAST` and `PSE` are currently supported |
| det_model_dir | str | xx | path of the detection inference model |
| det_limit_side_len | int | 960 | side-length limit of the detection image |
| det_limit_type | str | "max" | type of the side-length limit; `min` and `max` are currently supported: `min` guarantees the shortest image side is no less than `det_limit_side_len`, `max` guarantees the longest image side is no greater than `det_limit_side_len` |

The DB-related parameters are as follows:

| parameter | type | default | meaning |
| :--: | :--: | :--: | :--: |
| det_db_thresh | float | 0.3 | in the probability map output by DB, only pixels whose score exceeds this threshold are treated as text pixels |
| det_db_box_thresh | float | 0.6 | a detection result is treated as a text region when the average score of all pixels inside its box exceeds this threshold |
| det_db_unclip_ratio | float | 1.5 | expansion factor of the `Vatti clipping` algorithm, used to expand the text region |
| max_batch_size | int | 10 | batch size for prediction |
| use_dilation | bool | False | whether to dilate the segmentation result for better detection |
| det_db_score_mode | str | "fast" | method of computing the detection score in DB; `fast` and `slow` are supported: `fast` averages over all pixels inside the bounding rectangle of the polygon, `slow` averages over all pixels inside the original polygon, which is somewhat slower but more accurate. |

The EAST-related parameters are as follows:

| parameter | type | default | meaning |
| :--: | :--: | :--: | :--: |
| det_east_score_thresh | float | 0.8 | score-map threshold in EAST postprocessing |
| det_east_cover_thresh | float | 0.1 | average score threshold of text boxes in EAST postprocessing |
| det_east_nms_thresh | float | 0.2 | NMS threshold in EAST postprocessing |

The SAST-related parameters are as follows:

| parameter | type | default | meaning |
| :--: | :--: | :--: | :--: |
| det_sast_score_thresh | float | 0.5 | score threshold in SAST postprocessing |
| det_sast_nms_thresh | float | 0.5 | NMS threshold in SAST postprocessing |
| det_sast_polygon | bool | False | whether to run polygon detection; set it to True for curved-text scenes (such as Total-Text) |

The PSE-related parameters are as follows:

| parameter | type | default | meaning |
| :--: | :--: | :--: | :--: |
| det_pse_thresh | float | 0.0 | threshold for binarizing the output map |
| det_pse_box_thresh | float | 0.85 | threshold for filtering boxes; boxes below it are discarded |
| det_pse_min_area | float | 16 | minimum box area; boxes below it are discarded |
| det_pse_box_type | str | "box" | type of the returned boxes; box: four corner coordinates, poly: all point coordinates of the curved text |
| det_pse_scale | int | 1 | ratio of the input image to the map fed to postprocessing; e.g. for a `640*640` image with a `160*160` network output, a scale of 2 gives a `320*320` map for postprocessing. Larger values speed up postprocessing at some cost in accuracy |

* Text recognition model

| parameter | type | default | meaning |
| :--: | :--: | :--: | :--: |
| rec_algorithm | str | "CRNN" | name of the text recognition algorithm; `CRNN`, `SRN`, `RARE`, `NETR` and `SAR` are currently supported |
| rec_model_dir | str | none; required when the recognition model is used | path of the recognition inference model |
| rec_image_shape | list | [3, 32, 320] | image shape used during recognition |
| rec_batch_num | int | 6 | batch size for recognition |
| max_text_length | int | 25 | maximum length of the recognition result; effective for `SRN` |
| rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | character dictionary file for recognition |
| use_space_char | bool | True | whether to include the space character; if `True`, the `space` character is appended to the end of the character dictionary |


* End-to-end text detection and recognition model

| parameter | type | default | meaning |
| :--: | :--: | :--: | :--: |
| e2e_algorithm | str | "PGNet" | name of the end-to-end algorithm; `PGNet` is currently supported |
| e2e_model_dir | str | none; required when the end-to-end model is used | path of the end-to-end inference model |
| e2e_limit_side_len | int | 768 | side-length limit of the end-to-end input image |
| e2e_limit_type | str | "max" | type of the end-to-end side-length limit; `min` and `max` are currently supported: `min` guarantees the shortest image side is no less than `e2e_limit_side_len`, `max` guarantees the longest image side is no greater than `e2e_limit_side_len` |
| e2e_pgnet_score_thresh | float | 0.5 | end-to-end score threshold; results below it are discarded |
| e2e_char_dict_path | str | "./ppocr/utils/ic15_dict.txt" | path of the recognition dictionary file |
| e2e_pgnet_valid_set | str | "totaltext" | validation set name; `totaltext` and `partvgg` are currently supported, each dataset with its own postprocessing; keep it consistent with training |
| e2e_pgnet_mode | str | "fast" | method of computing the detection score in PGNet; `fast` and `slow` are supported: `fast` averages over all pixels inside the bounding rectangle of the polygon, `slow` averages over all pixels inside the original polygon, which is somewhat slower but more accurate. |


* Direction classifier model

| parameter | type | default | meaning |
| :--: | :--: | :--: | :--: |
| use_angle_cls | bool | False | whether to use the direction classifier |
| cls_model_dir | str | none; the path must be specified explicitly when needed | path of the direction-classifier inference model |
| cls_image_shape | list | [3, 48, 192] | prediction scale |
| label_list | list | ['0', '180'] | angle value of each class id |
| cls_batch_num | int | 6 | batch size of the direction classifier |
| cls_thresh | float | 0.9 | prediction threshold; when the model predicts 180 degrees with a score above this threshold, the final prediction is treated as 180 degrees and the image is flipped |
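A hedged usage sketch built only from flags documented in the tables above: invoke the system predictor with a few detection and engine options. The model directories are placeholders; run from the PaddleOCR root with the inference models downloaded.

```python
# Invoke tools/infer/predict_system.py with parameters documented above.
import subprocess

cmd = [
    "python3", "tools/infer/predict_system.py",
    "--image_dir", "./doc/imgs/12.jpg",          # image or folder path
    "--det_model_dir", "./inference/det_db/",    # placeholder model dirs
    "--rec_model_dir", "./inference/rec_crnn/",
    "--det_algorithm", "DB",
    "--det_db_thresh", "0.3",
    "--det_db_box_thresh", "0.6",
    "--use_gpu", "False",
    "--drop_score", "0.5",
]
subprocess.run(cmd, check=True)
```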
@@ -33,8 +33,8 @@ The downloadable models provided by PaddleOCR include the `inference model`, `trained model` and `pretrained model`.

|model name|description|config file|inference model size|download|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv2_det_slim|[Latest] slim quantization + distillation ultra-lightweight model, supporting Chinese, English and multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det|[Latest] original ultra-lightweight model, supporting Chinese, English and multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
|ch_PP-OCRv2_det_slim|[Latest] slim quantization + distillation ultra-lightweight model, supporting Chinese, English and multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det|[Latest] original ultra-lightweight model, supporting Chinese, English and multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
|ch_ppocr_mobile_slim_v2.0_det|slim-pruned ultra-lightweight model, supporting Chinese, English and multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)| 2.6M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)|
|ch_ppocr_mobile_v2.0_det|original ultra-lightweight model, supporting Chinese, English and multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|
|ch_ppocr_server_v2.0_det|general model, supporting Chinese, English and multilingual text detection; larger than the ultra-lightweight model but with better performance|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)|
@@ -66,13 +66,13 @@ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/e2e_server_pgnetA_infer.
### Prediction on a single image or a set of images
```bash
# predict a single image specified by image_dir
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext"

# predict all images in the directory specified by image_dir
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext"

# to predict on CPU, set the use_gpu parameter to False
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True --use_gpu=False
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext" --use_gpu=False
```
### Visualization results
The visualized text detection results are saved to the ./inference_results folder by default, and the names of the result files are prefixed with 'e2e_res'. An example result is as follows:
@@ -167,9 +167,9 @@ python3 tools/infer_e2e.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.infer_img=
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar && tar xf en_server_pgnetA.tar
python3 tools/export_model.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.pretrained_model=./en_server_pgnetA/best_accuracy Global.load_static_weights=False Global.save_inference_dir=./inference/e2e
```
**For PGNet end-to-end model inference, you need to set the parameter `--e2e_algorithm="PGNet"`**, and you can run the following command:
**For PGNet end-to-end model inference, you need to set the parameters `--e2e_algorithm="PGNet"` and `--e2e_pgnet_valid_set="partvgg"`**, and you can run the following command:
```
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img_10.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_polygon=False
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img_10.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="partvgg"
```
The visualized text detection results are saved to the `./inference_results` folder by default, and the names of the result files are prefixed with 'e2e_res'. An example result is as follows:


@@ -178,9 +178,9 @@ python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/im
#### (2) Curved text detection model (Total-Text)
For curved text examples

**For PGNet end-to-end model inference, you need to set the parameter `--e2e_algorithm="PGNet"` and also add the parameter `--e2e_pgnet_polygon=True`,** and you can run the following command:
**For PGNet end-to-end model inference, you need to set the parameter `--e2e_algorithm="PGNet"` and also add the parameter `--e2e_pgnet_valid_set="totaltext"`,** and you can run the following command:
```
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="totaltext"
```
The visualized end-to-end text results are saved to the `./inference_results` folder by default, and the names of the result files are prefixed with 'e2e_res'. An example result is as follows:
@@ -29,8 +29,8 @@ Relationship of the above models is as follows.

|model name|description|config|model size|download|
| --- | --- | --- | --- | --- |
|ch_PP-OCRv2_det_slim|[New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det|[New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
|ch_PP-OCRv2_det_slim|[New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)|
|ch_PP-OCRv2_det|[New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)|
|ch_ppocr_mobile_slim_v2.0_det|Slim pruned lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|2.6M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)|
|ch_ppocr_mobile_v2.0_det|Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)|
|ch_ppocr_server_v2.0_det|General model, which is larger than the lightweight model, but achieved better performance|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)|
@@ -59,13 +59,13 @@ After decompression, there should be the following file structure:
### Single image or image set prediction
```bash
# Prediction single image specified by image_dir
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext"

# Prediction the collection of images specified by image_dir
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext"

# If you want to use CPU for prediction, you need to set use_gpu parameter is false
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_polygon=True --use_gpu=False
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --use_gpu=False --e2e_pgnet_valid_set="totaltext"
```
### Visualization results
The visualized end-to-end results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'e2e_res'. Examples of results are as follows:
@@ -166,9 +166,9 @@ First, convert the model saved in the PGNet end-to-end training process into an
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar && tar xf en_server_pgnetA.tar
python3 tools/export_model.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.pretrained_model=./en_server_pgnetA/best_accuracy Global.load_static_weights=False Global.save_inference_dir=./inference/e2e
```
**For PGNet quadrangle end-to-end model inference, you need to set the parameter `--e2e_algorithm="PGNet"`**, run the following command:
**For PGNet quadrangle end-to-end model inference, you need to set the parameters `--e2e_algorithm="PGNet"` and `--e2e_pgnet_valid_set="partvgg"`**, run the following command:
```
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img_10.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_polygon=False
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img_10.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="partvgg"
```
The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'e2e_res'. Examples of results are as follows:


@@ -176,9 +176,9 @@ The visualized text detection results are saved to the `./inference_results` fol

#### (2). Curved text detection model (Total-Text)
For the curved text example, we use the same model as the quadrilateral
**For PGNet end-to-end curved text detection model inference, you need to set the parameter `--e2e_algorithm="PGNet"` and `--e2e_pgnet_polygon=True`**, run the following command:
**For PGNet end-to-end curved text detection model inference, you need to set the parameter `--e2e_algorithm="PGNet"` and `--e2e_pgnet_valid_set="totaltext"`**, run the following command:
```
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_polygon=True
python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="totaltext"
```
The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'e2e_res'. Examples of results are as follows:
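The only difference between the quadrangle (partvgg) and curved-text (totaltext) PGNet invocations above is the `--e2e_pgnet_valid_set` value, which must match the postprocessing used in training. A small Python sketch tying the two cases together; paths are placeholders:

```python
# Launch PGNet end-to-end inference, picking the validation-set flag by
# text geometry, per the documented commands above.
import subprocess

def run_pgnet(image_dir, model_dir, curved_text):
    valid_set = "totaltext" if curved_text else "partvgg"
    subprocess.run([
        "python3", "tools/infer/predict_e2e.py",
        "--e2e_algorithm", "PGNet",
        "--image_dir", image_dir,
        "--e2e_model_dir", model_dir,
        "--e2e_pgnet_valid_set", valid_set,
    ], check=True)

run_pgnet("./doc/imgs_en/img_10.jpg", "./inference/e2e/", curved_text=False)
run_pgnet("./doc/imgs_en/img623.jpg", "./inference/e2e/", curved_text=True)
```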
@@ -42,7 +42,7 @@ __all__ = [
]

SUPPORT_DET_MODEL = ['DB']
VERSION = '2.3.0.1'
VERSION = '2.3.0.2'
SUPPORT_REC_MODEL = ['CRNN']
BASE_DIR = os.path.expanduser("~/.paddleocr/")
@@ -30,21 +30,17 @@ class CenterLoss(nn.Layer):
    Reference: Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016.
    """

    def __init__(self,
                 num_classes=6625,
                 feat_dim=96,
                 init_center=False,
                 center_file_path=None):
    def __init__(self, num_classes=6625, feat_dim=96, center_file_path=None):
        super().__init__()
        self.num_classes = num_classes
        self.feat_dim = feat_dim
        self.centers = paddle.randn(
            shape=[self.num_classes, self.feat_dim]).astype("float64")

        if init_center:
        if center_file_path is not None:
            assert os.path.exists(
                center_file_path
            ), f"center path({center_file_path}) must exist when init_center is set as True."
            ), f"center path({center_file_path}) must exist when it is not None."
            with open(center_file_path, 'rb') as f:
                char_dict = pickle.load(f)
                for key in char_dict.keys():
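A usage sketch for the simplified constructor above: the `init_center` flag is gone, so passing a `center_file_path` now implies loading the pickled centers, while `None` keeps the random initialization. This assumes the ppocr package is importable from a PaddleOCR checkout:

```python
# Constructing CenterLoss with the new signature shown in the diff above.
from ppocr.losses.center_loss import CenterLoss

# random centers (the common case)
loss_fn = CenterLoss(num_classes=6625, feat_dim=96, center_file_path=None)

# precomputed centers: the file must exist, otherwise the assert fires
# loss_fn = CenterLoss(num_classes=6625, feat_dim=96,
#                      center_file_path="./train_center.pkl")
```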
@@ -16,7 +16,7 @@ __all__ = ["build_backbone"]


def build_backbone(config, model_type):
    if model_type == "det":
    if model_type == "det" or model_type == "table":
        from .det_mobilenet_v3 import MobileNetV3
        from .det_resnet_vd import ResNet
        from .det_resnet_vd_sast import ResNet_SAST

@@ -36,10 +36,6 @@ def build_backbone(config, model_type):
    elif model_type == "e2e":
        from .e2e_resnet_vd_pg import ResNet
        support_dict = ["ResNet"]
    elif model_type == "table":
        from .table_resnet_vd import ResNet
        from .table_mobilenet_v3 import MobileNetV3
        support_dict = ["ResNet", "MobileNetV3"]
    else:
        raise NotImplementedError
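Sketch of the effect of the branch change above: a 'table' model now reuses the detection backbones instead of the deleted table_* variants. This assumes a PaddleOCR checkout on the path; the config keys follow the det backbones' constructor arguments:

```python
# After this change, model_type="table" routes through the det backbones.
from ppocr.modeling.backbones import build_backbone

config = {"name": "MobileNetV3", "scale": 0.5, "model_name": "large"}
backbone = build_backbone(dict(config), model_type="table")
print(type(backbone).__name__)  # MobileNetV3 from det_mobilenet_v3
```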
@@ -26,8 +26,10 @@ class MobileNetV3(nn.Layer):
                 scale=0.5,
                 large_stride=None,
                 small_stride=None,
                 disable_se=False,
                 **kwargs):
        super(MobileNetV3, self).__init__()
        self.disable_se = disable_se
        if small_stride is None:
            small_stride = [2, 2, 2, 2]
        if large_stride is None:

@@ -101,6 +103,7 @@ class MobileNetV3(nn.Layer):
        block_list = []
        inplanes = make_divisible(inplanes * scale)
        for (k, exp, c, se, nl, s) in cfg:
            se = se and not self.disable_se
            block_list.append(
                ResidualUnit(
                    in_channels=inplanes,
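Sketch of the new `disable_se` switch above for the recognition backbone: when `disable_se=True`, every squeeze-and-excitation block in the cfg is skipped (the per-layer `se` flag is forced to False). Assumes paddle and a PaddleOCR checkout are available:

```python
# Build the recognition MobileNetV3 with SE blocks disabled.
import paddle
from ppocr.modeling.backbones.rec_mobilenet_v3 import MobileNetV3

net = MobileNetV3(in_channels=3, model_name="small", scale=0.5,
                  disable_se=True)
x = paddle.randn([1, 3, 32, 320])  # typical recognition input shape
y = net(x)
print(y.shape)
```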
@@ -1,287 +0,0 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr

__all__ = ['MobileNetV3']


def make_divisible(v, divisor=8, min_value=None):
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class MobileNetV3(nn.Layer):
    def __init__(self,
                 in_channels=3,
                 model_name='large',
                 scale=0.5,
                 disable_se=False,
                 **kwargs):
        """
        the MobilenetV3 backbone network for detection module.
        Args:
            params(dict): the super parameters for build network
        """
        super(MobileNetV3, self).__init__()

        self.disable_se = disable_se

        if model_name == "large":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, False, 'relu', 1],
                [3, 64, 24, False, 'relu', 2],
                [3, 72, 24, False, 'relu', 1],
                [5, 72, 40, True, 'relu', 2],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
                [3, 240, 80, False, 'hardswish', 2],
                [3, 200, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 480, 112, True, 'hardswish', 1],
                [3, 672, 112, True, 'hardswish', 1],
                [5, 672, 160, True, 'hardswish', 2],
                [5, 960, 160, True, 'hardswish', 1],
                [5, 960, 160, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 960
        elif model_name == "small":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, True, 'relu', 2],
                [3, 72, 24, False, 'relu', 2],
                [3, 88, 24, False, 'relu', 1],
                [5, 96, 40, True, 'hardswish', 2],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 120, 48, True, 'hardswish', 1],
                [5, 144, 48, True, 'hardswish', 1],
                [5, 288, 96, True, 'hardswish', 2],
                [5, 576, 96, True, 'hardswish', 1],
                [5, 576, 96, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 576
        else:
            raise NotImplementedError("mode[" + model_name +
                                      "_model] is not implemented!")

        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
        assert scale in supported_scale, \
            "supported scale are {} but input scale is {}".format(supported_scale, scale)
        inplanes = 16
        # conv1
        self.conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=make_divisible(inplanes * scale),
            kernel_size=3,
            stride=2,
            padding=1,
            groups=1,
            if_act=True,
            act='hardswish',
            name='conv1')

        self.stages = []
        self.out_channels = []
        block_list = []
        i = 0
        inplanes = make_divisible(inplanes * scale)
        for (k, exp, c, se, nl, s) in cfg:
            se = se and not self.disable_se
            start_idx = 2 if model_name == 'large' else 0
            if s == 2 and i > start_idx:
                self.out_channels.append(inplanes)
                self.stages.append(nn.Sequential(*block_list))
                block_list = []
            block_list.append(
                ResidualUnit(
                    in_channels=inplanes,
                    mid_channels=make_divisible(scale * exp),
                    out_channels=make_divisible(scale * c),
                    kernel_size=k,
                    stride=s,
                    use_se=se,
                    act=nl,
                    name="conv" + str(i + 2)))
            inplanes = make_divisible(scale * c)
            i += 1
        block_list.append(
            ConvBNLayer(
                in_channels=inplanes,
                out_channels=make_divisible(scale * cls_ch_squeeze),
                kernel_size=1,
                stride=1,
                padding=0,
                groups=1,
                if_act=True,
                act='hardswish',
                name='conv_last'))
        self.stages.append(nn.Sequential(*block_list))
        self.out_channels.append(make_divisible(scale * cls_ch_squeeze))
        for i, stage in enumerate(self.stages):
            self.add_sublayer(sublayer=stage, name="stage{}".format(i))

    def forward(self, x):
        x = self.conv(x)
        out_list = []
        for stage in self.stages:
            x = stage(x)
            out_list.append(x)
        return out_list


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)

        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=None,
            param_attr=ParamAttr(name=name + "_bn_scale"),
            bias_attr=ParamAttr(name=name + "_bn_offset"),
            moving_mean_name=name + "_bn_mean",
            moving_variance_name=name + "_bn_variance")

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.if_act:
            if self.act == "relu":
                x = F.relu(x)
            elif self.act == "hardswish":
                x = F.hardswish(x)
            else:
                print("The activation function({}) is selected incorrectly.".
                      format(self.act))
                exit()
        return x


class ResidualUnit(nn.Layer):
    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 use_se,
                 act=None,
                 name=''):
        super(ResidualUnit, self).__init__()
        self.if_shortcut = stride == 1 and in_channels == out_channels
        self.if_se = use_se

        self.expand_conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=True,
            act=act,
            name=name + "_expand")
        self.bottleneck_conv = ConvBNLayer(
            in_channels=mid_channels,
            out_channels=mid_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=int((kernel_size - 1) // 2),
            groups=mid_channels,
            if_act=True,
            act=act,
            name=name + "_depthwise")
        if self.if_se:
            self.mid_se = SEModule(mid_channels, name=name + "_se")
        self.linear_conv = ConvBNLayer(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=False,
            act=None,
            name=name + "_linear")

    def forward(self, inputs):
        x = self.expand_conv(inputs)
        x = self.bottleneck_conv(x)
        if self.if_se:
            x = self.mid_se(x)
        x = self.linear_conv(x)
        if self.if_shortcut:
            x = paddle.add(inputs, x)
        return x


class SEModule(nn.Layer):
    def __init__(self, in_channels, reduction=4, name=""):
        super(SEModule, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2D(1)
        self.conv1 = nn.Conv2D(
            in_channels=in_channels,
            out_channels=in_channels // reduction,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(name=name + "_1_weights"),
            bias_attr=ParamAttr(name=name + "_1_offset"))
        self.conv2 = nn.Conv2D(
            in_channels=in_channels // reduction,
            out_channels=in_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(name + "_2_weights"),
            bias_attr=ParamAttr(name=name + "_2_offset"))

    def forward(self, inputs):
        outputs = self.avg_pool(inputs)
        outputs = self.conv1(outputs)
        outputs = F.relu(outputs)
        outputs = self.conv2(outputs)
        outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)
        return inputs * outputs
@ -1,280 +0,0 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
from paddle import ParamAttr
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
__all__ = ["ResNet"]
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
groups=1,
|
||||
is_vd_mode=False,
|
||||
act=None,
|
||||
name=None, ):
|
||||
super(ConvBNLayer, self).__init__()
|
||||
|
||||
self.is_vd_mode = is_vd_mode
|
||||
self._pool2d_avg = nn.AvgPool2D(
|
||||
kernel_size=2, stride=2, padding=0, ceil_mode=True)
|
||||
self._conv = nn.Conv2D(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        self._batch_norm = nn.BatchNorm(
            out_channels,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def forward(self, inputs):
        if self.is_vd_mode:
            inputs = self._pool2d_avg(inputs)
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y


class BottleneckBlock(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BottleneckBlock, self).__init__()

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=name + "_branch2c")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv2)
        y = F.relu(y)
        return y


class BasicBlock(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BasicBlock, self).__init__()
        self.stride = stride
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=name + "_branch2b")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = paddle.add(x=short, y=conv1)
        y = F.relu(y)
        return y


class ResNet(nn.Layer):
    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        num_channels = [64, 256, 512,
                        1024] if layers >= 50 else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512]

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=2,
            act='relu',
            name="conv1_1")
        self.conv1_2 = ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_2")
        self.conv1_3 = ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_3")
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.stages = []
        self.out_channels = []
        if layers >= 50:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(bottleneck_block)
                self.out_channels.append(num_filters[block] * 4)
                self.stages.append(nn.Sequential(*block_list))
        else:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(basic_block)
                self.out_channels.append(num_filters[block])
                self.stages.append(nn.Sequential(*block_list))

    def forward(self, inputs):
        y = self.conv1_1(inputs)
        y = self.conv1_2(y)
        y = self.conv1_3(y)
        y = self.pool2d_max(y)
        out = []
        for block in self.stages:
            y = block(y)
            out.append(y)
        return out
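
A quick way to sanity-check the backbone above (a minimal sketch; it assumes `paddle` is installed and that the `ResNet` class is importable from this module):

```python
import paddle

# Build the 50-layer variant and run a dummy NCHW image through it.
model = ResNet(in_channels=3, layers=50)
x = paddle.randn([1, 3, 640, 640])
feats = model(x)  # forward() returns one feature map per residual stage
for c, f in zip(model.out_channels, feats):
    print(c, f.shape)  # e.g. 256 [1, 256, 160, 160] ... 2048 [1, 2048, 20, 20]
```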

@@ -53,7 +53,6 @@ class AttentionHead(nn.Layer):
                output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
            output = paddle.concat(output_hiddens, axis=1)
            probs = self.generator(output)

        else:
            targets = paddle.zeros(shape=[batch_size], dtype="int32")
            probs = None

@@ -75,6 +74,7 @@ class AttentionHead(nn.Layer):
                    probs_step, axis=1)], axis=1)
                next_input = probs_step.argmax(axis=1)
                targets = next_input
        if not self.training:
            probs = paddle.nn.functional.softmax(probs, axis=2)
        return probs

@@ -53,7 +53,7 @@ def compute_partial_repr(input_points, control_points):
                                                                          1]
    repr_matrix = 0.5 * pairwise_dist * paddle.log(pairwise_dist)
    # fix numerical error for 0 * log(0), substitute all nan with 0
-    mask = repr_matrix != repr_matrix
+    mask = np.array(repr_matrix != repr_matrix)
    repr_matrix[mask] = 0
    return repr_matrix
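
The one-line change above only wraps the self-inequality NaN test in `np.array` so the mask can index the tensor; the trick itself is standard. A standalone numpy sketch of the `0 * log(0)` repair:

```python
import numpy as np

d = np.array([0.0, 1.0, 4.0])
with np.errstate(divide='ignore', invalid='ignore'):
    r = 0.5 * d * np.log(d)  # 0 * log(0) -> 0 * -inf -> NaN
mask = np.array(r != r)      # NaN is the only value unequal to itself
r[mask] = 0
print(r)                     # [0. 0. 2.77258872]
```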

@@ -29,6 +29,7 @@ class EASTPostProcess(object):
    """
    The post process for EAST.
    """

    def __init__(self,
                 score_thresh=0.8,
                 cover_thresh=0.1,
@@ -38,11 +39,6 @@ class EASTPostProcess(object):
        self.score_thresh = score_thresh
        self.cover_thresh = cover_thresh
        self.nms_thresh = nms_thresh

-        # c++ la-nms is faster, but only support python 3.5
-        self.is_python35 = False
-        if sys.version_info.major == 3 and sys.version_info.minor == 5:
-            self.is_python35 = True
-
    def restore_rectangle_quad(self, origin, geometry):
        """
@@ -64,6 +60,7 @@ class EASTPostProcess(object):
        """
        restore text boxes from score map and geo map
        """

        score_map = score_map[0]
        geo_map = np.swapaxes(geo_map, 1, 0)
        geo_map = np.swapaxes(geo_map, 1, 2)
@@ -79,10 +76,14 @@ class EASTPostProcess(object):
        boxes = np.zeros((text_box_restored.shape[0], 9), dtype=np.float32)
        boxes[:, :8] = text_box_restored.reshape((-1, 8))
        boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]]
-        if self.is_python35:
-            boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh)
-        else:
+        try:
+            import lanms
+            boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh)
+        except:
+            print(
+                'you should install lanms by pip3 install lanms-nova to speed up nms_locality'
+            )
            boxes = nms_locality(boxes.astype(np.float64), nms_thresh)
        if boxes.shape[0] == 0:
            return []
@@ -139,4 +140,4 @@ class EASTPostProcess(object):
                continue
            boxes_norm.append(box)
        dt_boxes_list.append({'points': np.array(boxes_norm)})
-    return dt_boxes_list
+    return dt_boxes_list

@@ -54,14 +54,37 @@ def load_model(config, model, optimizer=None):
    pretrained_model = global_config.get('pretrained_model')
    best_model_dict = {}
    if checkpoints:
-        if checkpoints.endswith('pdparams'):
+        if checkpoints.endswith('.pdparams'):
            checkpoints = checkpoints.replace('.pdparams', '')
-        assert os.path.exists(checkpoints + ".pdopt"), \
-            f"The {checkpoints}.pdopt does not exists!"
-        load_pretrained_params(model, checkpoints)
-        optim_dict = paddle.load(checkpoints + '.pdopt')
+        assert os.path.exists(checkpoints + ".pdparams"), \
+            "The {}.pdparams does not exists!".format(checkpoints)
+
+        # load params from trained model
+        params = paddle.load(checkpoints + '.pdparams')
+        state_dict = model.state_dict()
+        new_state_dict = {}
+        for key, value in state_dict.items():
+            if key not in params:
+                logger.warning("{} not in loaded params {} !".format(
+                    key, params.keys()))
+                continue
+            pre_value = params[key]
+            if list(value.shape) == list(pre_value.shape):
+                new_state_dict[key] = pre_value
+            else:
+                logger.warning(
+                    "The shape of model params {} {} not matched with loaded params shape {} !".
+                    format(key, value.shape, pre_value.shape))
+        model.set_state_dict(new_state_dict)
+
        if optimizer is not None:
-            optimizer.set_state_dict(optim_dict)
+            if os.path.exists(checkpoints + '.pdopt'):
+                optim_dict = paddle.load(checkpoints + '.pdopt')
+                optimizer.set_state_dict(optim_dict)
+            else:
+                logger.warning(
+                    "{}.pdopt is not exists, params of optimizer is not loaded".
+                    format(checkpoints))

        if os.path.exists(checkpoints + '.states'):
            with open(checkpoints + '.states', 'rb') as f:
@@ -80,10 +103,10 @@ def load_model(config, model, optimizer=None):

def load_pretrained_params(model, path):
    logger = get_logger()
-    if path.endswith('pdparams'):
+    if path.endswith('.pdparams'):
        path = path.replace('.pdparams', '')
    assert os.path.exists(path + ".pdparams"), \
-        f"The {path}.pdparams does not exists!"
+        "The {}.pdparams does not exists!".format(path)

    params = paddle.load(path + '.pdparams')
    state_dict = model.state_dict()
@@ -92,11 +115,11 @@ def load_pretrained_params(model, path):
        if list(state_dict[k1].shape) == list(params[k2].shape):
            new_state_dict[k1] = params[k2]
        else:
-            logger.info(
-                f"The shape of model params {k1} {state_dict[k1].shape} not matched with loaded params {k2} {params[k2].shape} !"
-            )
+            logger.warning(
+                "The shape of model params {} {} not matched with loaded params {} {} !".
+                format(k1, state_dict[k1].shape, k2, params[k2].shape))
    model.set_state_dict(new_state_dict)
-    logger.info(f"load pretrain successful from {path}")
+    logger.info("load pretrain successful from {}".format(path))
    return model
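
For orientation, the loader above operates on Paddle's checkpoint file triplet; the paths below are illustrative only, not from the source:

```python
# A training checkpoint is a file triplet sharing one stem, e.g.:
#   output/det/best_accuracy.pdparams  -> model weights (required)
#   output/det/best_accuracy.pdopt     -> optimizer state (now optional)
#   output/det/best_accuracy.states    -> training bookkeeping (optional)
# With the change above, resuming tolerates a missing .pdopt:
best_model_dict = load_model(config, model, optimizer)
# optimizer state is restored only if <checkpoints>.pdopt exists on disk;
# otherwise a warning is logged and training continues with fresh state.
```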

@@ -0,0 +1,182 @@
# Visual Question Answering (VQA)

Main features of VQA:

- Integrates the [LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf) model together with the PP-OCR inference engine.
- Supports the multimodal Semantic Entity Recognition (SER) and Relation Extraction (RE) tasks. SER recognizes and classifies the text in an image; RE extracts relations between the text segments in an image (for example, question-answer pairs).
- Supports end-to-end prediction and evaluation of the SER task chained with an OCR engine.
- Supports custom training for both the SER and RE tasks.


This project is an open-source implementation of [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/pdf/2104.08836.pdf) on Paddle 2.2,
including fine-tuning code on the [XFUND dataset](https://github.com/doc-analysis/XFUND).

## 1. Demo

**Note:** the test images come from the XFUND dataset.

### 1.1 SER

<div align="center">
    <img src="./images/result_ser/zh_val_0_ser.jpg" width = "600" />
</div>

<div align="center">
    <img src="./images/result_ser/zh_val_42_ser.jpg" width = "600" />
</div>

Boxes of different colors mark different categories; the XFUND dataset has the three categories `QUESTION`, `ANSWER`, and `HEADER`. The predicted category and the OCR recognition result are also drawn at the top left of each OCR detection box.


### 1.2 RE

* Coming soon!



## 2. Installation

### 2.1 Install dependencies

- **(1) Install PaddlePaddle**

```bash
pip3 install --upgrade pip

# GPU install
python3 -m pip install paddlepaddle-gpu==2.2 -i https://mirror.baidu.com/pypi/simple

# CPU install
python3 -m pip install paddlepaddle==2.2 -i https://mirror.baidu.com/pypi/simple

```
For other options, follow the [installation guide](https://www.paddlepaddle.org.cn/install/quick).


### 2.2 Install PaddleOCR (including PP-OCR and VQA)

- **(1) Quick install of the PaddleOCR whl package via pip (inference only)**

```bash
pip install "paddleocr>=2.2" # version 2.2+ is recommended
```

- **(2) Download the VQA source code (inference + training)**

```bash
# Recommended:
git clone https://github.com/PaddlePaddle/PaddleOCR

# If the pull fails because of network problems, the Gitee mirror can be used instead:
git clone https://gitee.com/paddlepaddle/PaddleOCR

# Note: the Gitee mirror may lag 3-5 days behind the GitHub project, so please prefer the recommended way.
```

- **(3) Install PaddleNLP**

```bash
# the latest PaddleNLP code is required
git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
cd PaddleNLP
pip install -e .
```


- **(4) Install the VQA `requirements`**

```bash
pip install -r requirements.txt
```

## 3. Usage


### 3.1 Data and pretrained model preparation

The preprocessed XFUND Chinese dataset can be downloaded from [https://paddleocr.bj.bcebos.com/dataset/XFUND.tar](https://paddleocr.bj.bcebos.com/dataset/XFUND.tar).


Download and extract the dataset, then place it under the current directory.

```shell
wget https://paddleocr.bj.bcebos.com/dataset/XFUND.tar
```
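
The archive can then be unpacked in place (a plain `tar` call, shown for completeness since only the download command appears above):

```shell
tar -xf XFUND.tar
```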

To convert the other-language subsets of XFUND, refer to the [XFUN data conversion script](helper/trans_xfun_data.py).

To try prediction directly, download the SER pretrained model we provide and skip the training step.

* SER pretrained model: [download link](https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar)
* RE pretrained model: coming soon!


### 3.2 SER task

* Start training

```shell
python train_ser.py \
    --model_name_or_path "layoutxlm-base-uncased" \
    --train_data_dir "XFUND/zh_train/image" \
    --train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
    --eval_data_dir "XFUND/zh_val/image" \
    --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
    --num_train_epochs 200 \
    --eval_steps 10 \
    --save_steps 500 \
    --output_dir "./output/ser/" \
    --learning_rate 5e-5 \
    --warmup_steps 50 \
    --evaluate_during_training \
    --seed 2048
```

The `precision`, `recall`, and `f1` metrics are printed at the end, as below.

```
best metrics: {'loss': 1.066644651549203, 'precision': 0.8770182068017863, 'recall': 0.9361936193619362, 'f1': 0.9056402979780063}
```

The model and the training log are saved in the `./output/ser/` folder.

* Predict using the OCR results shipped with the evaluation set

```shell
export CUDA_VISIBLE_DEVICES=0
python3.7 infer_ser.py \
    --model_name_or_path "./PP-Layout_v1.0_ser_pretrained/" \
    --output_dir "output_res/" \
    --infer_imgs "XFUND/zh_val/image/" \
    --ocr_json_path "XFUND/zh_val/xfun_normalize_val.json"
```

The visualized prediction images and the prediction text file, named `infer_results.txt`, are saved in the `output_res` directory.

* Run the chained `OCR engine + SER` system

```shell
export CUDA_VISIBLE_DEVICES=0
python3.7 infer_ser_e2e.py \
    --model_name_or_path "./output/PP-Layout_v1.0_ser_pretrained/" \
    --max_seq_length 512 \
    --output_dir "output_res_e2e/"
```

* End-to-end evaluation of the `OCR engine + SER` prediction system

```shell
export CUDA_VISIBLE_DEVICES=0
python helper/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
```


### 3.3 RE task

coming soon!


## References

- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf
- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm
- XFUND dataset, https://github.com/doc-analysis/XFUND

@@ -0,0 +1,262 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import sys
# import Polygon
import shapely
from shapely.geometry import Polygon
import numpy as np
from collections import defaultdict
import operator
import editdistance
import argparse
import json
import copy


def parse_ser_results_fp(fp, fp_type="gt", ignore_background=True):
    # img/zh_val_0.jpg	{
    #     "height": 3508,
    #     "width": 2480,
    #     "ocr_info": [
    #         {"text": "Maribyrnong", "label": "other", "bbox": [1958, 144, 2184, 198]},
    #         {"text": "CITYCOUNCIL", "label": "other", "bbox": [2052, 183, 2171, 214]},
    #     ]
    assert fp_type in ["gt", "pred"]
    key = "label" if fp_type == "gt" else "pred"
    res_dict = dict()
    with open(fp, "r") as fin:
        lines = fin.readlines()

    for _, line in enumerate(lines):
        img_path, info = line.strip().split("\t")
        # get key
        image_name = os.path.basename(img_path)
        res_dict[image_name] = []
        # get infos
        json_info = json.loads(info)
        for single_ocr_info in json_info["ocr_info"]:
            label = single_ocr_info[key].upper()
            if label in ["O", "OTHERS", "OTHER"]:
                label = "O"
            if ignore_background and label == "O":
                continue
            single_ocr_info["label"] = label
            res_dict[image_name].append(copy.deepcopy(single_ocr_info))
    return res_dict


def polygon_from_str(polygon_points):
    """
    Create a shapely polygon object from gt or dt line.
    """
    polygon_points = np.array(polygon_points).reshape(4, 2)
    polygon = Polygon(polygon_points).convex_hull
    return polygon


def polygon_iou(poly1, poly2):
    """
    Intersection over union between two shapely polygons.
    """
    if not poly1.intersects(
            poly2):  # this test is fast and can accelerate calculation
        iou = 0
    else:
        try:
            inter_area = poly1.intersection(poly2).area
            union_area = poly1.area + poly2.area - inter_area
            iou = float(inter_area) / union_area
        except shapely.geos.TopologicalError:
            # except Exception as e:
            #     print(e)
            print('shapely.geos.TopologicalError occured, iou set to 0')
            iou = 0
    return iou


def ed(args, str1, str2):
    if args.ignore_space:
        str1 = str1.replace(" ", "")
        str2 = str2.replace(" ", "")
    if args.ignore_case:
        str1 = str1.lower()
        str2 = str2.lower()
    return editdistance.eval(str1, str2)


def convert_bbox_to_polygon(bbox):
    """
    bbox  : [x1, y1, x2, y2]
    output: [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
    """
    xmin, ymin, xmax, ymax = bbox
    poly = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]
    return poly


def eval_e2e(args):
    # gt
    gt_results = parse_ser_results_fp(args.gt_json_path, "gt",
                                      args.ignore_background)
    # pred
    dt_results = parse_ser_results_fp(args.pred_json_path, "pred",
                                      args.ignore_background)
    assert set(gt_results.keys()) == set(dt_results.keys())

    iou_thresh = args.iou_thres
    num_gt_chars = 0
    gt_count = 0
    dt_count = 0
    hit = 0
    ed_sum = 0

    for img_name in gt_results:
        gt_info = gt_results[img_name]
        gt_count += len(gt_info)

        dt_info = dt_results[img_name]
        dt_count += len(dt_info)

        dt_match = [False] * len(dt_info)
        gt_match = [False] * len(gt_info)

        all_ious = defaultdict(tuple)
        # gt: {text, label, bbox or poly}
        for index_gt, gt in enumerate(gt_info):
            if "poly" not in gt:
                gt["poly"] = convert_bbox_to_polygon(gt["bbox"])
            gt_poly = polygon_from_str(gt["poly"])
            for index_dt, dt in enumerate(dt_info):
                if "poly" not in dt:
                    dt["poly"] = convert_bbox_to_polygon(dt["bbox"])
                dt_poly = polygon_from_str(dt["poly"])
                iou = polygon_iou(dt_poly, gt_poly)
                if iou >= iou_thresh:
                    all_ious[(index_gt, index_dt)] = iou
        sorted_ious = sorted(
            all_ious.items(), key=operator.itemgetter(1), reverse=True)
        sorted_gt_dt_pairs = [item[0] for item in sorted_ious]

        # matched gt and dt
        for gt_dt_pair in sorted_gt_dt_pairs:
            index_gt, index_dt = gt_dt_pair
            if gt_match[index_gt] == False and dt_match[index_dt] == False:
                gt_match[index_gt] = True
                dt_match[index_dt] = True
                # ocr rec results
                gt_text = gt_info[index_gt]["text"]
                dt_text = dt_info[index_dt]["text"]

                # ser results
                gt_label = gt_info[index_gt]["label"]
                dt_label = dt_info[index_dt]["pred"]

                if True:  # ignore_masks[index_gt] == '0':
                    ed_sum += ed(args, gt_text, dt_text)
                    num_gt_chars += len(gt_text)
                    if gt_text == dt_text:
                        if args.ignore_ser_prediction or gt_label == dt_label:
                            hit += 1

        # unmatched dt
        for tindex, dt_match_flag in enumerate(dt_match):
            if dt_match_flag == False:
                dt_text = dt_info[tindex]["text"]
                gt_text = ""
                ed_sum += ed(args, dt_text, gt_text)

        # unmatched gt
        for tindex, gt_match_flag in enumerate(gt_match):
            if gt_match_flag == False:
                dt_text = ""
                gt_text = gt_info[tindex]["text"]
                ed_sum += ed(args, gt_text, dt_text)
                num_gt_chars += len(gt_text)

    eps = 1e-9
    print("config: ", args)
    print('hit, dt_count, gt_count', hit, dt_count, gt_count)
    precision = hit / (dt_count + eps)
    recall = hit / (gt_count + eps)
    fmeasure = 2.0 * precision * recall / (precision + recall + eps)
    avg_edit_dist_img = ed_sum / len(gt_results)
    avg_edit_dist_field = ed_sum / (gt_count + eps)
    character_acc = 1 - ed_sum / (num_gt_chars + eps)

    print('character_acc: %.2f' % (character_acc * 100) + "%")
    print('avg_edit_dist_field: %.2f' % (avg_edit_dist_field))
    print('avg_edit_dist_img: %.2f' % (avg_edit_dist_img))
    print('precision: %.2f' % (precision * 100) + "%")
    print('recall: %.2f' % (recall * 100) + "%")
    print('fmeasure: %.2f' % (fmeasure * 100) + "%")

    return


def parse_args():
    """
    """

    def str2bool(v):
        return v.lower() in ("true", "t", "1")

    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument(
        "--gt_json_path",
        default=None,
        type=str,
        required=True, )
    parser.add_argument(
        "--pred_json_path",
        default=None,
        type=str,
        required=True, )

    parser.add_argument("--iou_thres", default=0.5, type=float)

    parser.add_argument(
        "--ignore_case",
        default=False,
        type=str2bool,
        help="whether to do lower case for the strs")

    parser.add_argument(
        "--ignore_space",
        default=True,
        type=str2bool,
        help="whether to ignore space")

    parser.add_argument(
        "--ignore_background",
        default=True,
        type=str2bool,
        help="whether to ignore other label")

    parser.add_argument(
        "--ignore_ser_prediction",
        default=False,
        type=str2bool,
        help="whether to ignore ocr pred results")

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    eval_e2e(args)
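
For reference, each line of both `--gt_json_path` and `--pred_json_path` is a tab-separated image path plus JSON blob, as documented at the top of `parse_ser_results_fp`; an abbreviated illustrative line:

```
img/zh_val_0.jpg	{"height": 3508, "width": 2480, "ocr_info": [{"text": "Maribyrnong", "label": "other", "bbox": [1958, 144, 2184, 198]}]}
```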

@@ -0,0 +1,52 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json


def transfer_xfun_data(json_path=None, output_file=None):
    with open(json_path, "r") as fin:
        lines = fin.readlines()

    json_info = json.loads(lines[0])
    documents = json_info["documents"]
    label_info = {}
    with open(output_file, "w") as fout:
        for idx, document in enumerate(documents):
            img_info = document["img"]
            document = document["document"]
            image_path = img_info["fname"]

            label_info["height"] = img_info["height"]
            label_info["width"] = img_info["width"]

            label_info["ocr_info"] = []

            for doc in document:
                label_info["ocr_info"].append({
                    "text": doc["text"],
                    "label": doc["label"],
                    "bbox": doc["box"],
                    "id": doc["id"],
                    "linking": doc["linking"],
                    "words": doc["words"]
                })

            fout.write(image_path + "\t" + json.dumps(
                label_info, ensure_ascii=False) + "\n")

    print("===ok====")


transfer_xfun_data("./xfun/zh.val.json", "./xfun_normalize_val.json")

@@ -0,0 +1,279 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import json
import cv2
import numpy as np
from copy import deepcopy

import paddle

# relative reference
from utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification


def pad_sentences(tokenizer,
                  encoded_inputs,
                  max_seq_len=512,
                  pad_to_max_seq_len=True,
                  return_attention_mask=True,
                  return_token_type_ids=True,
                  return_overflowing_tokens=False,
                  return_special_tokens_mask=False):
    # Padding with larger size, reshape is carried out
    max_seq_len = (
        len(encoded_inputs["input_ids"]) // max_seq_len + 1) * max_seq_len

    needs_to_be_padded = pad_to_max_seq_len and \
        max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len

    if needs_to_be_padded:
        difference = max_seq_len - len(encoded_inputs["input_ids"])
        if tokenizer.padding_side == 'right':
            if return_attention_mask:
                encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
                    "input_ids"]) + [0] * difference
            if return_token_type_ids:
                encoded_inputs["token_type_ids"] = (
                    encoded_inputs["token_type_ids"] +
                    [tokenizer.pad_token_type_id] * difference)
            if return_special_tokens_mask:
                encoded_inputs["special_tokens_mask"] = encoded_inputs[
                    "special_tokens_mask"] + [1] * difference
            encoded_inputs["input_ids"] = encoded_inputs[
                "input_ids"] + [tokenizer.pad_token_id] * difference
            encoded_inputs["bbox"] = encoded_inputs["bbox"] + [[0, 0, 0, 0]
                                                               ] * difference
        else:
            assert False, f"padding_side of tokenizer just supports [\"right\"] but got {tokenizer.padding_side}"
    else:
        if return_attention_mask:
            encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
                "input_ids"])

    return encoded_inputs


def split_page(encoded_inputs, max_seq_len=512):
    """
    truncate is often used in training process
    """
    for key in encoded_inputs:
        encoded_inputs[key] = paddle.to_tensor(encoded_inputs[key])
        if encoded_inputs[key].ndim <= 1:  # for input_ids, att_mask and so on
            encoded_inputs[key] = encoded_inputs[key].reshape([-1, max_seq_len])
        else:  # for bbox
            encoded_inputs[key] = encoded_inputs[key].reshape(
                [-1, max_seq_len, 4])
    return encoded_inputs


def preprocess(
        tokenizer,
        ori_img,
        ocr_info,
        img_size=(224, 224),
        pad_token_label_id=-100,
        max_seq_len=512,
        add_special_ids=False,
        return_attention_mask=True, ):
    ocr_info = deepcopy(ocr_info)
    height = ori_img.shape[0]
    width = ori_img.shape[1]

    img = cv2.resize(ori_img,
                     (224, 224)).transpose([2, 0, 1]).astype(np.float32)

    segment_offset_id = []
    words_list = []
    bbox_list = []
    input_ids_list = []
    token_type_ids_list = []

    for info in ocr_info:
        # x1, y1, x2, y2
        bbox = info["bbox"]
        bbox[0] = int(bbox[0] * 1000.0 / width)
        bbox[2] = int(bbox[2] * 1000.0 / width)
        bbox[1] = int(bbox[1] * 1000.0 / height)
        bbox[3] = int(bbox[3] * 1000.0 / height)

        text = info["text"]
        encode_res = tokenizer.encode(
            text, pad_to_max_seq_len=False, return_attention_mask=True)

        if not add_special_ids:
            # TODO: use tok.all_special_ids to remove
            encode_res["input_ids"] = encode_res["input_ids"][1:-1]
            encode_res["token_type_ids"] = encode_res["token_type_ids"][1:-1]
            encode_res["attention_mask"] = encode_res["attention_mask"][1:-1]

        input_ids_list.extend(encode_res["input_ids"])
        token_type_ids_list.extend(encode_res["token_type_ids"])
        bbox_list.extend([bbox] * len(encode_res["input_ids"]))
        words_list.append(text)
        segment_offset_id.append(len(input_ids_list))

    encoded_inputs = {
        "input_ids": input_ids_list,
        "token_type_ids": token_type_ids_list,
        "bbox": bbox_list,
        "attention_mask": [1] * len(input_ids_list),
    }

    encoded_inputs = pad_sentences(
        tokenizer,
        encoded_inputs,
        max_seq_len=max_seq_len,
        return_attention_mask=return_attention_mask)

    encoded_inputs = split_page(encoded_inputs)

    fake_bs = encoded_inputs["input_ids"].shape[0]

    encoded_inputs["image"] = paddle.to_tensor(img).unsqueeze(0).expand(
        [fake_bs] + list(img.shape))

    encoded_inputs["segment_offset_id"] = segment_offset_id

    return encoded_inputs


def postprocess(attention_mask, preds, label_map_path):
    if isinstance(preds, paddle.Tensor):
        preds = preds.numpy()
    preds = np.argmax(preds, axis=2)

    _, label_map = get_bio_label_maps(label_map_path)

    preds_list = [[] for _ in range(preds.shape[0])]

    # keep batch info
    for i in range(preds.shape[0]):
        for j in range(preds.shape[1]):
            if attention_mask[i][j] == 1:
                preds_list[i].append(label_map[preds[i][j]])

    return preds_list


def merge_preds_list_with_ocr_info(label_map_path, ocr_info, segment_offset_id,
                                   preds_list):
    # must ensure the preds_list is generated from the same image
    preds = [p for pred in preds_list for p in pred]
    label2id_map, _ = get_bio_label_maps(label_map_path)
    for key in label2id_map:
        if key.startswith("I-"):
            label2id_map[key] = label2id_map["B" + key[1:]]

    id2label_map = dict()
    for key in label2id_map:
        val = label2id_map[key]
        if key == "O":
            id2label_map[val] = key
        if key.startswith("B-") or key.startswith("I-"):
            id2label_map[val] = key[2:]
        else:
            id2label_map[val] = key

    for idx in range(len(segment_offset_id)):
        if idx == 0:
            start_id = 0
        else:
            start_id = segment_offset_id[idx - 1]

        end_id = segment_offset_id[idx]

        curr_pred = preds[start_id:end_id]
        curr_pred = [label2id_map[p] for p in curr_pred]

        if len(curr_pred) <= 0:
            pred_id = 0
        else:
            counts = np.bincount(curr_pred)
            pred_id = np.argmax(counts)
        ocr_info[idx]["pred_id"] = int(pred_id)
        ocr_info[idx]["pred"] = id2label_map[pred_id]
    return ocr_info


@paddle.no_grad()
def infer(args):
    os.makedirs(args.output_dir, exist_ok=True)

    # init token and model
    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
    # model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
    model = LayoutXLMForTokenClassification.from_pretrained(
        args.model_name_or_path)
    model.eval()

    # load ocr results json
    ocr_results = dict()
    with open(args.ocr_json_path, "r") as fin:
        lines = fin.readlines()
        for line in lines:
            img_name, json_info = line.split("\t")
            ocr_results[os.path.basename(img_name)] = json.loads(json_info)

    # get infer img list
    infer_imgs = get_image_file_list(args.infer_imgs)

    # loop for infer
    with open(os.path.join(args.output_dir, "infer_results.txt"), "w") as fout:
        for idx, img_path in enumerate(infer_imgs):
            print("process: [{}/{}], {}".format(idx, len(infer_imgs), img_path))

            img = cv2.imread(img_path)

            ocr_info = ocr_results[os.path.basename(img_path)]["ocr_info"]
            inputs = preprocess(
                tokenizer=tokenizer,
                ori_img=img,
                ocr_info=ocr_info,
                max_seq_len=args.max_seq_length)

            outputs = model(
                input_ids=inputs["input_ids"],
                bbox=inputs["bbox"],
                image=inputs["image"],
                token_type_ids=inputs["token_type_ids"],
                attention_mask=inputs["attention_mask"])

            preds = outputs[0]
            preds = postprocess(inputs["attention_mask"], preds,
                                args.label_map_path)
            ocr_info = merge_preds_list_with_ocr_info(
                args.label_map_path, ocr_info, inputs["segment_offset_id"],
                preds)

            fout.write(img_path + "\t" + json.dumps(
                {
                    "ocr_info": ocr_info,
                }, ensure_ascii=False) + "\n")

            img_res = draw_ser_results(img, ocr_info)
            cv2.imwrite(
                os.path.join(args.output_dir, os.path.basename(img_path)),
                img_res)

    return


if __name__ == "__main__":
    args = parse_args()
    infer(args)

@@ -0,0 +1,121 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import json
import cv2
import numpy as np
from copy import deepcopy
from PIL import Image

import paddle
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification

# relative reference
from utils import parse_args, get_image_file_list, draw_ser_results, get_bio_label_maps, build_ocr_engine

from utils import pad_sentences, split_page, preprocess, postprocess, merge_preds_list_with_ocr_info


def trans_poly_to_bbox(poly):
    x1 = np.min([p[0] for p in poly])
    x2 = np.max([p[0] for p in poly])
    y1 = np.min([p[1] for p in poly])
    y2 = np.max([p[1] for p in poly])
    return [x1, y1, x2, y2]


def parse_ocr_info_for_ser(ocr_result):
    ocr_info = []
    for res in ocr_result:
        ocr_info.append({
            "text": res[1][0],
            "bbox": trans_poly_to_bbox(res[0]),
            "poly": res[0],
        })
    return ocr_info


@paddle.no_grad()
def infer(args):
    os.makedirs(args.output_dir, exist_ok=True)

    # init token and model
    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
    model = LayoutXLMForTokenClassification.from_pretrained(
        args.model_name_or_path)
    model.eval()

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    label2id_map_for_draw = dict()
    for key in label2id_map:
        if key.startswith("I-"):
            label2id_map_for_draw[key] = label2id_map["B" + key[1:]]
        else:
            label2id_map_for_draw[key] = label2id_map[key]

    # get infer img list
    infer_imgs = get_image_file_list(args.infer_imgs)

    ocr_engine = build_ocr_engine(args.ocr_rec_model_dir,
                                  args.ocr_det_model_dir)

    # loop for infer
    with open(os.path.join(args.output_dir, "infer_results.txt"), "w") as fout:
        for idx, img_path in enumerate(infer_imgs):
            print("process: [{}/{}], {}".format(idx, len(infer_imgs), img_path))

            img = cv2.imread(img_path)

            ocr_result = ocr_engine.ocr(img_path, cls=False)

            ocr_info = parse_ocr_info_for_ser(ocr_result)

            inputs = preprocess(
                tokenizer=tokenizer,
                ori_img=img,
                ocr_info=ocr_info,
                max_seq_len=args.max_seq_length)

            outputs = model(
                input_ids=inputs["input_ids"],
                bbox=inputs["bbox"],
                image=inputs["image"],
                token_type_ids=inputs["token_type_ids"],
                attention_mask=inputs["attention_mask"])

            preds = outputs[0]
            preds = postprocess(inputs["attention_mask"], preds, id2label_map)
            ocr_info = merge_preds_list_with_ocr_info(
                ocr_info, inputs["segment_offset_id"], preds,
                label2id_map_for_draw)

            fout.write(img_path + "\t" + json.dumps(
                {
                    "ocr_info": ocr_info,
                }, ensure_ascii=False) + "\n")

            img_res = draw_ser_results(img, ocr_info)
            cv2.imwrite(
                os.path.join(args.output_dir,
                             os.path.splitext(os.path.basename(img_path))[0] +
                             "_ser.jpg"), img_res)

    return


if __name__ == "__main__":
    args = parse_args()
    infer(args)

@@ -0,0 +1,3 @@
QUESTION
ANSWER
HEADER
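
This three-line label file is expanded into BIO tags by `get_bio_label_maps` in `utils.py` below; a small sketch of the resulting mapping:

```python
# Each non-"O" line becomes a B-/I- pair, with "O" prepended as background.
lines = ["QUESTION", "ANSWER", "HEADER"]
labels = ["O"]
for line in lines:
    labels += ["B-" + line, "I-" + line]
label2id_map = {label: idx for idx, label in enumerate(labels)}
print(label2id_map)
# {'O': 0, 'B-QUESTION': 1, 'I-QUESTION': 2, 'B-ANSWER': 3,
#  'I-ANSWER': 4, 'B-HEADER': 5, 'I-HEADER': 6}
```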

@@ -0,0 +1,2 @@
sentencepiece
yacs

@@ -0,0 +1,313 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os
import random
import copy
import logging

import argparse
import paddle
import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
from xfun import XFUNDataset
from utils import parse_args
from utils import get_bio_label_maps

logger = logging.getLogger(__name__)


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)


def train(args):
    os.makedirs(args.output_dir, exist_ok=True)
    logging.basicConfig(
        filename=os.path.join(args.output_dir, "train.log")
        if paddle.distributed.get_rank() == 0 else None,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if paddle.distributed.get_rank() == 0 else logging.WARN, )

    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    logger.addHandler(ch)

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    # dist mode
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
    base_model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
    model = LayoutXLMForTokenClassification(
        base_model, num_classes=len(label2id_map), dropout=None)

    # dist mode
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    train_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.train_data_dir,
        label_path=args.train_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=pad_token_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')

    train_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)

    args.train_batch_size = args.per_gpu_train_batch_size * max(
        1, paddle.distributed.get_world_size())

    train_dataloader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        num_workers=0,
        use_shared_memory=True,
        collate_fn=None, )

    t_total = len(train_dataloader) * args.num_train_epochs

    # build linear decay with warmup lr sch
    lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=t_total,
        end_lr=0.0,
        power=1.0)
    if args.warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler,
            args.warmup_steps,
            start_lr=0,
            end_lr=args.learning_rate, )

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        epsilon=args.adam_epsilon,
        weight_decay=args.weight_decay)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed) = %d",
        args.train_batch_size * paddle.distributed.get_world_size(), )
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0
    set_seed(args)
    best_metrics = None

    for epoch_id in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            model.train()
            outputs = model(**batch)
            # model outputs are always tuple in ppnlp (see doc)
            loss = outputs[0]
            loss = loss.mean()
            logger.info(
                "[epoch {}/{}][iter: {}/{}] lr: {:.5f}, train loss: {:.5f}, ".
                format(epoch_id, args.num_train_epochs, step,
                       len(train_dataloader),
                       lr_scheduler.get_lr(), loss.numpy()[0]))

            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            lr_scheduler.step()  # Update learning rate schedule
            optimizer.clear_grad()
            global_step += 1

            if (paddle.distributed.get_rank() == 0 and args.eval_steps > 0 and
                    global_step % args.eval_steps == 0):
                # Log metrics
                # Only evaluate when single GPU otherwise metrics may not average well
                if paddle.distributed.get_rank(
                ) == 0 and args.evaluate_during_training:
                    results, _ = evaluate(
                        args,
                        model,
                        tokenizer,
                        label2id_map,
                        id2label_map,
                        pad_token_label_id, )

                    if best_metrics is None or results["f1"] >= best_metrics[
                            "f1"]:
                        best_metrics = copy.deepcopy(results)
                        output_dir = os.path.join(args.output_dir, "best_model")
                        os.makedirs(output_dir, exist_ok=True)
                        if paddle.distributed.get_rank() == 0:
                            model.save_pretrained(output_dir)
                            tokenizer.save_pretrained(output_dir)
                            paddle.save(
                                args,
                                os.path.join(output_dir, "training_args.bin"))
                            logger.info("Saving model checkpoint to %s",
                                        output_dir)

                    logger.info("[epoch {}/{}][iter: {}/{}] results: {}".format(
                        epoch_id, args.num_train_epochs, step,
                        len(train_dataloader), results))
                    if best_metrics is not None:
                        logger.info("best metrics: {}".format(best_metrics))

            if paddle.distributed.get_rank(
            ) == 0 and args.save_steps > 0 and global_step % args.save_steps == 0:
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir,
                                          "checkpoint-{}".format(global_step))
                os.makedirs(output_dir, exist_ok=True)
                if paddle.distributed.get_rank() == 0:
                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(args,
                                os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

    return global_step, tr_loss / global_step


def evaluate(args,
             model,
             tokenizer,
             label2id_map,
             id2label_map,
             pad_token_label_id,
             prefix=""):
    eval_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=pad_token_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(
        1, paddle.distributed.get_world_size())

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.eval_batch_size,
        num_workers=0,
        use_shared_memory=True,
        collate_fn=None, )

    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()
    for idx, batch in enumerate(eval_dataloader):
        with paddle.no_grad():
            outputs = model(**batch)
            tmp_eval_loss, logits = outputs[:2]

            tmp_eval_loss = tmp_eval_loss.mean()

            if paddle.distributed.get_rank() == 0:
                logger.info("[Eval]process: {}/{}, loss: {:.5f}".format(
                    idx, len(eval_dataloader), tmp_eval_loss.numpy()[0]))

        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.numpy()
            out_label_ids = batch["labels"].numpy()
        else:
            preds = np.append(preds, logits.numpy(), axis=0)
            out_label_ids = np.append(
                out_label_ids, batch["labels"].numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(preds, axis=2)

    # label_map = {i: label.upper() for i, label in enumerate(labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(id2label_map[out_label_ids[i][j]])
                preds_list[i].append(id2label_map[preds[i][j]])

    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

    with open(os.path.join(args.output_dir, "test_gt.txt"), "w") as fout:
        for lbl in out_label_list:
            for l in lbl:
                fout.write(l + "\t")
            fout.write("\n")
    with open(os.path.join(args.output_dir, "test_pred.txt"), "w") as fout:
        for lbl in preds_list:
            for l in lbl:
                fout.write(l + "\t")
            fout.write("\n")

    report = classification_report(out_label_list, preds_list)
    logger.info("\n" + report)

    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, preds_list


def print_arguments(args):
    """print arguments"""
    print('-----------  Configuration Arguments -----------')
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


if __name__ == "__main__":
    args = parse_args()
    print_arguments(args)
    train(args)
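
Since `train()` above already contains the dist-mode branches (`init_parallel_env`, `DataParallel`, `DistributedBatchSampler`), multi-GPU training only changes the launcher; a sketch (GPU ids and flag values are placeholders):

```bash
python -m paddle.distributed.launch --gpus "0,1,2,3" train_ser.py \
    --model_name_or_path "layoutxlm-base-uncased" \
    --train_data_dir "XFUND/zh_train/image" \
    --train_label_path "XFUND/zh_train/xfun_normalize_train.json" \
    --eval_data_dir "XFUND/zh_val/image" \
    --eval_label_path "XFUND/zh_val/xfun_normalize_val.json" \
    --output_dir "./output/ser/"
```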
@ -0,0 +1,328 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import cv2
|
||||
import random
|
||||
import numpy as np
|
||||
import imghdr
|
||||
from copy import deepcopy
|
||||
|
||||
import paddle
|
||||
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
from paddleocr import PaddleOCR
|
||||
|
||||
|
||||
def get_bio_label_maps(label_map_path):
|
||||
with open(label_map_path, "r") as fin:
|
||||
lines = fin.readlines()
|
||||
lines = [line.strip() for line in lines]
|
||||
if "O" not in lines:
|
||||
lines.insert(0, "O")
|
||||
labels = []
|
||||
for line in lines:
|
||||
if line == "O":
|
||||
labels.append("O")
|
||||
else:
|
||||
labels.append("B-" + line)
|
||||
labels.append("I-" + line)
|
||||
label2id_map = {label: idx for idx, label in enumerate(labels)}
|
||||
id2label_map = {idx: label for idx, label in enumerate(labels)}
|
||||
return label2id_map, id2label_map
|
||||
|
||||
|
||||
def get_image_file_list(img_file):
|
||||
imgs_lists = []
|
||||
if img_file is None or not os.path.exists(img_file):
|
||||
raise Exception("not found any img file in {}".format(img_file))
|
||||
|
||||
img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'GIF'}
|
||||
if os.path.isfile(img_file) and imghdr.what(img_file) in img_end:
|
||||
imgs_lists.append(img_file)
|
||||
elif os.path.isdir(img_file):
|
||||
for single_file in os.listdir(img_file):
|
||||
file_path = os.path.join(img_file, single_file)
|
||||
if os.path.isfile(file_path) and imghdr.what(file_path) in img_end:
|
||||
imgs_lists.append(file_path)
|
||||
if len(imgs_lists) == 0:
|
||||
raise Exception("not found any img file in {}".format(img_file))
|
||||
imgs_lists = sorted(imgs_lists)
|
||||
return imgs_lists
|
||||
|
||||
|
||||
def draw_ser_results(image,
|
||||
ocr_results,
|
||||
font_path="../doc/fonts/simfang.ttf",
|
||||
font_size=18):
|
||||
np.random.seed(0)
|
||||
color = (np.random.permutation(range(255)),
|
||||
np.random.permutation(range(255)),
|
||||
np.random.permutation(range(255)))
|
||||
color_map = {
|
||||
idx: (color[0][idx], color[1][idx], color[2][idx])
|
||||
for idx in range(1, 255)
|
||||
}
|
||||
if isinstance(image, np.ndarray):
|
||||
image = Image.fromarray(image)
|
||||
img_new = image.copy()
|
||||
draw = ImageDraw.Draw(img_new)
|
||||
|
||||
font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
|
||||
|
||||
for ocr_info in ocr_results:
|
||||
if ocr_info["pred_id"] not in color_map:
|
||||
continue
|
||||
color = color_map[ocr_info["pred_id"]]
|
||||
|
||||
# draw ocr results outline
|
||||
bbox = ocr_info["bbox"]
|
||||
bbox = ((bbox[0], bbox[1]), (bbox[2], bbox[3]))
|
||||
draw.rectangle(bbox, fill=color)
|
||||
|
||||
# draw ocr results
|
||||
text = "{}: {}".format(ocr_info["pred"], ocr_info["text"])
|
||||
start_y = max(0, bbox[0][1] - font_size)
|
||||
tw = font.getsize(text)[0]
|
||||
draw.rectangle(
|
||||
[(bbox[0][0] + 1, start_y), (bbox[0][0] + tw + 1,
|
||||
start_y + font_size)],
|
||||
fill=(0, 0, 255))
|
||||
draw.text(
|
||||
(bbox[0][0] + 1, start_y), text, fill=(255, 255, 255), font=font)
|
||||
|
||||
img_new = Image.blend(image, img_new, 0.5)
|
||||
return np.array(img_new)
|
||||
|
||||
|
||||
def build_ocr_engine(rec_model_dir, det_model_dir):
|
||||
ocr_engine = PaddleOCR(
|
||||
rec_model_dir=rec_model_dir,
|
||||
det_model_dir=det_model_dir,
|
||||
use_angle_cls=False)
|
||||
return ocr_engine
|
||||
|
||||
|
||||
# pad sentences
|
||||
def pad_sentences(tokenizer,
|
||||
encoded_inputs,
|
||||
max_seq_len=512,
|
||||
pad_to_max_seq_len=True,
|
||||
return_attention_mask=True,
|
||||
return_token_type_ids=True,
|
||||
return_overflowing_tokens=False,
|
||||
return_special_tokens_mask=False):
|
||||
# Padding with larger size, reshape is carried out
|
||||
max_seq_len = (
|
||||
len(encoded_inputs["input_ids"]) // max_seq_len + 1) * max_seq_len
|
||||
|
||||
needs_to_be_padded = pad_to_max_seq_len and \
|
||||
max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len
|
||||
|
||||
if needs_to_be_padded:
|
||||
difference = max_seq_len - len(encoded_inputs["input_ids"])
|
||||
if tokenizer.padding_side == 'right':
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
|
||||
"input_ids"]) + [0] * difference
|
||||
if return_token_type_ids:
|
||||
encoded_inputs["token_type_ids"] = (
|
||||
encoded_inputs["token_type_ids"] +
|
||||
[tokenizer.pad_token_type_id] * difference)
|
||||
if return_special_tokens_mask:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs[
|
||||
"special_tokens_mask"] + [1] * difference
|
||||
encoded_inputs["input_ids"] = encoded_inputs[
|
||||
"input_ids"] + [tokenizer.pad_token_id] * difference
|
||||
encoded_inputs["bbox"] = encoded_inputs["bbox"] + [[0, 0, 0, 0]
|
||||
] * difference
|
||||
else:
|
||||
if return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
|
||||
"input_ids"])
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
|
||||
def split_page(encoded_inputs, max_seq_len=512):
|
||||
"""
|
||||
truncate is often used in training process
|
||||
"""
|
||||
for key in encoded_inputs:
|
||||
encoded_inputs[key] = paddle.to_tensor(encoded_inputs[key])
|
||||
if encoded_inputs[key].ndim <= 1: # for input_ids, att_mask and so on
|
||||
encoded_inputs[key] = encoded_inputs[key].reshape([-1, max_seq_len])
|
||||
else: # for bbox
|
||||
encoded_inputs[key] = encoded_inputs[key].reshape(
|
||||
[-1, max_seq_len, 4])
|
||||
return encoded_inputs


def preprocess(
        tokenizer,
        ori_img,
        ocr_info,
        img_size=(224, 224),
        pad_token_label_id=-100,
        max_seq_len=512,
        add_special_ids=False,
        return_attention_mask=True, ):
    ocr_info = deepcopy(ocr_info)
    height = ori_img.shape[0]
    width = ori_img.shape[1]

    # use the img_size argument rather than a hard-coded (224, 224)
    img = cv2.resize(ori_img,
                     img_size).transpose([2, 0, 1]).astype(np.float32)

    segment_offset_id = []
    words_list = []
    bbox_list = []
    input_ids_list = []
    token_type_ids_list = []

    for info in ocr_info:
        # x1, y1, x2, y2
        bbox = info["bbox"]
        bbox[0] = int(bbox[0] * 1000.0 / width)
        bbox[2] = int(bbox[2] * 1000.0 / width)
        bbox[1] = int(bbox[1] * 1000.0 / height)
        bbox[3] = int(bbox[3] * 1000.0 / height)

        text = info["text"]
        encode_res = tokenizer.encode(
            text, pad_to_max_seq_len=False, return_attention_mask=True)

        if not add_special_ids:
            # TODO: use tok.all_special_ids to remove
            encode_res["input_ids"] = encode_res["input_ids"][1:-1]
            encode_res["token_type_ids"] = encode_res["token_type_ids"][1:-1]
            encode_res["attention_mask"] = encode_res["attention_mask"][1:-1]

        input_ids_list.extend(encode_res["input_ids"])
        token_type_ids_list.extend(encode_res["token_type_ids"])
        bbox_list.extend([bbox] * len(encode_res["input_ids"]))
        words_list.append(text)
        segment_offset_id.append(len(input_ids_list))

    encoded_inputs = {
        "input_ids": input_ids_list,
        "token_type_ids": token_type_ids_list,
        "bbox": bbox_list,
        "attention_mask": [1] * len(input_ids_list),
    }

    encoded_inputs = pad_sentences(
        tokenizer,
        encoded_inputs,
        max_seq_len=max_seq_len,
        return_attention_mask=return_attention_mask)

    encoded_inputs = split_page(encoded_inputs)

    fake_bs = encoded_inputs["input_ids"].shape[0]

    encoded_inputs["image"] = paddle.to_tensor(img).unsqueeze(0).expand(
        [fake_bs] + list(img.shape))

    encoded_inputs["segment_offset_id"] = segment_offset_id

    return encoded_inputs
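
# End-to-end sketch (editor's note, hypothetical model name and inputs):
# preprocess ties the OCR output and the tokenizer together before the
# LayoutXLM forward pass.
#
#     from paddlenlp.transformers import LayoutXLMTokenizer
#     tokenizer = LayoutXLMTokenizer.from_pretrained("layoutxlm-base-uncased")
#     inputs = preprocess(tokenizer, ori_img, ocr_info, max_seq_len=512)
#     # inputs["input_ids"] has shape [fake_bs, 512]; the resized image is
#     # replicated once per 512-token page.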


def postprocess(attention_mask, preds, id2label_map):
    if isinstance(preds, paddle.Tensor):
        preds = preds.numpy()
    preds = np.argmax(preds, axis=2)

    preds_list = [[] for _ in range(preds.shape[0])]

    # keep batch info
    for i in range(preds.shape[0]):
        for j in range(preds.shape[1]):
            if attention_mask[i][j] == 1:
                preds_list[i].append(id2label_map[preds[i][j]])

    return preds_list
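
# Shape note (editor's note): preds arrives as [batch, seq_len, num_labels]
# logits; argmax over axis 2 yields label ids and the attention mask drops the
# padded positions, so for each row:
#
#     len(preds_list[i]) == int(np.sum(attention_mask[i]))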


def merge_preds_list_with_ocr_info(ocr_info, segment_offset_id, preds_list,
                                   label2id_map_for_draw):
    # must ensure the preds_list is generated from the same image
    preds = [p for pred in preds_list for p in pred]

    id2label_map = dict()
    for key in label2id_map_for_draw:
        val = label2id_map_for_draw[key]
        if key.startswith("B-") or key.startswith("I-"):
            id2label_map[val] = key[2:]
        else:
            # covers "O" and any label without a B-/I- prefix
            id2label_map[val] = key

    for idx in range(len(segment_offset_id)):
        if idx == 0:
            start_id = 0
        else:
            start_id = segment_offset_id[idx - 1]

        end_id = segment_offset_id[idx]

        curr_pred = preds[start_id:end_id]
        curr_pred = [label2id_map_for_draw[p] for p in curr_pred]

        if len(curr_pred) <= 0:
            pred_id = 0
        else:
            counts = np.bincount(curr_pred)
            pred_id = np.argmax(counts)
        ocr_info[idx]["pred_id"] = int(pred_id)
        ocr_info[idx]["pred"] = id2label_map[int(pred_id)]
    return ocr_info
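
# Majority vote in miniature (editor's note): np.bincount plus np.argmax pick
# the most frequent token-level label id as the segment-level prediction.
#
#     curr_pred = [2, 2, 3, 2]           # token label ids of one segment
#     np.argmax(np.bincount(curr_pred))  # -> 2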


def parse_args():
    parser = argparse.ArgumentParser()
    # Required parameters
    # yapf: disable
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,)
    parser.add_argument("--train_data_dir", default=None, type=str, required=False,)
    parser.add_argument("--train_label_path", default=None, type=str, required=False,)
    parser.add_argument("--eval_data_dir", default=None, type=str, required=False,)
    parser.add_argument("--eval_label_path", default=None, type=str, required=False,)
    parser.add_argument("--output_dir", default=None, type=str, required=True,)
    parser.add_argument("--max_seq_length", default=512, type=int,)
    parser.add_argument("--evaluate_during_training", action="store_true",)
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",)
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for eval.",)
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",)
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.",)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.",)
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.",)
    parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.",)
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.",)
    parser.add_argument("--eval_steps", type=int, default=10, help="eval every X updates steps.",)
    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.",)
    parser.add_argument("--seed", type=int, default=2048, help="random seed for initialization",)

    parser.add_argument("--ocr_rec_model_dir", default=None, type=str,)
    parser.add_argument("--ocr_det_model_dir", default=None, type=str,)
    parser.add_argument("--label_map_path", default="./labels/labels_ser.txt", type=str, required=False,)
    parser.add_argument("--infer_imgs", default=None, type=str, required=False)
    parser.add_argument("--ocr_json_path", default=None, type=str, required=False, help="ocr prediction results")
    # yapf: enable
    args = parser.parse_args()
    return args

@ -0,0 +1,442 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import cv2
import numpy as np
import paddle
import copy
from paddle.io import Dataset

__all__ = ["XFUNDataset"]


class XFUNDataset(Dataset):
    """
    Example:
        print("=====begin to build dataset=====")
        from paddlenlp.transformers import LayoutXLMTokenizer
        tokenizer = LayoutXLMTokenizer.from_pretrained("/paddle/models/transformers/layoutxlm-base-paddle/")
        tok_res = tokenizer.tokenize("Maribyrnong")
        # res = tokenizer.convert_ids_to_tokens(val_data["input_ids"][0])
        dataset = XFUNDataset(
            tokenizer,
            data_dir="./zh.val/",
            label_path="zh.val/xfun_normalize_val.json",
            img_size=(224, 224))
        print(len(dataset))

        data = dataset[0]
        print(data.keys())
        print("input_ids: ", data["input_ids"])
        print("labels: ", data["labels"])
        print("token_type_ids: ", data["token_type_ids"])
        print("words_list: ", data["words_list"])
        print("image shape: ", data["image"].shape)
    """

    def __init__(self,
                 tokenizer,
                 data_dir,
                 label_path,
                 contains_re=False,
                 label2id_map=None,
                 img_size=(224, 224),
                 pad_token_label_id=None,
                 add_special_ids=False,
                 return_attention_mask=True,
                 load_mode='all',
                 max_seq_len=512):
        super().__init__()
        self.tokenizer = tokenizer
        self.data_dir = data_dir
        self.label_path = label_path
        self.contains_re = contains_re
        self.label2id_map = label2id_map
        self.img_size = img_size
        self.pad_token_label_id = pad_token_label_id
        self.add_special_ids = add_special_ids
        self.return_attention_mask = return_attention_mask
        self.load_mode = load_mode
        self.max_seq_len = max_seq_len

        if self.pad_token_label_id is None:
            self.pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

        self.all_lines = self.read_all_lines()

        self.entities_labels = {'HEADER': 0, 'QUESTION': 1, 'ANSWER': 2}
        self.return_keys = {
            'bbox': 'np',
            'input_ids': 'np',
            'labels': 'np',
            'attention_mask': 'np',
            'image': 'np',
            'token_type_ids': 'np',
            'entities': 'dict',
            'relations': 'dict',
        }

        if load_mode == "all":
            self.encoded_inputs_all = self._parse_label_file_all()

    def pad_sentences(self,
                      encoded_inputs,
                      max_seq_len=512,
                      pad_to_max_seq_len=True,
                      return_attention_mask=True,
                      return_token_type_ids=True,
                      truncation_strategy="longest_first",
                      return_overflowing_tokens=False,
                      return_special_tokens_mask=False):
        # Padding
        needs_to_be_padded = pad_to_max_seq_len and \
            max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len

        if needs_to_be_padded:
            difference = max_seq_len - len(encoded_inputs["input_ids"])
            if self.tokenizer.padding_side == 'right':
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
                        "input_ids"]) + [0] * difference
                if return_token_type_ids:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"] +
                        [self.tokenizer.pad_token_type_id] * difference)
                if return_special_tokens_mask:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs[
                        "special_tokens_mask"] + [1] * difference
                encoded_inputs["input_ids"] = encoded_inputs[
                    "input_ids"] + [self.tokenizer.pad_token_id] * difference
                encoded_inputs["labels"] = encoded_inputs[
                    "labels"] + [self.pad_token_label_id] * difference
                encoded_inputs["bbox"] = encoded_inputs[
                    "bbox"] + [[0, 0, 0, 0]] * difference
            elif self.tokenizer.padding_side == 'left':
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [0] * difference + [
                        1
                    ] * len(encoded_inputs["input_ids"])
                if return_token_type_ids:
                    encoded_inputs["token_type_ids"] = (
                        [self.tokenizer.pad_token_type_id] * difference +
                        encoded_inputs["token_type_ids"])
                if return_special_tokens_mask:
                    encoded_inputs["special_tokens_mask"] = [
                        1
                    ] * difference + encoded_inputs["special_tokens_mask"]
                encoded_inputs["input_ids"] = [
                    self.tokenizer.pad_token_id
                ] * difference + encoded_inputs["input_ids"]
                encoded_inputs["labels"] = [
                    self.pad_token_label_id
                ] * difference + encoded_inputs["labels"]
                encoded_inputs["bbox"] = [
                    [0, 0, 0, 0]
                ] * difference + encoded_inputs["bbox"]
        else:
            if return_attention_mask:
                encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
                    "input_ids"])

        return encoded_inputs

    def truncate_inputs(self, encoded_inputs, max_seq_len=512):
        for key in encoded_inputs:
            if key == "sample_id":
                continue
            length = min(len(encoded_inputs[key]), max_seq_len)
            encoded_inputs[key] = encoded_inputs[key][:length]
        return encoded_inputs

    def read_all_lines(self, ):
        with open(self.label_path, "r") as fin:
            lines = fin.readlines()
        return lines

    def _parse_label_file_all(self):
        """
        parse all samples
        """
        encoded_inputs_all = []
        for line in self.all_lines:
            encoded_inputs_all.extend(self._parse_label_file(line))
        return encoded_inputs_all

    def _parse_label_file(self, line):
        """
        parse single sample
        """

        image_name, info_str = line.split("\t")
        image_path = os.path.join(self.data_dir, image_name)

        def add_image_path(x):
            x['image_path'] = image_path
            return x

        encoded_inputs = self._read_encoded_inputs_sample(info_str)
        if self.contains_re:
            encoded_inputs = self._chunk_re(encoded_inputs)
        else:
            encoded_inputs = self._chunk_ser(encoded_inputs)
        encoded_inputs = list(map(add_image_path, encoded_inputs))
        return encoded_inputs

    def _read_encoded_inputs_sample(self, info_str):
        """
        parse label info
        """
        # read text info
        info_dict = json.loads(info_str)
        height = info_dict["height"]
        width = info_dict["width"]

        words_list = []
        bbox_list = []
        input_ids_list = []
        token_type_ids_list = []
        gt_label_list = []

        if self.contains_re:
            # for re
            entities = []
            relations = []
            id2label = {}
            entity_id_to_index_map = {}
            empty_entity = set()
        for info in info_dict["ocr_info"]:
            if self.contains_re:
                # for re
                if len(info["text"]) == 0:
                    empty_entity.add(info["id"])
                    continue
                id2label[info["id"]] = info["label"]
                relations.extend([tuple(sorted(l)) for l in info["linking"]])

            # x1, y1, x2, y2
            bbox = info["bbox"]
            label = info["label"]
            bbox[0] = int(bbox[0] * 1000.0 / width)
            bbox[2] = int(bbox[2] * 1000.0 / width)
            bbox[1] = int(bbox[1] * 1000.0 / height)
            bbox[3] = int(bbox[3] * 1000.0 / height)

            text = info["text"]
            encode_res = self.tokenizer.encode(
                text, pad_to_max_seq_len=False, return_attention_mask=True)

            gt_label = []
            if not self.add_special_ids:
                # TODO: use tok.all_special_ids to remove
                encode_res["input_ids"] = encode_res["input_ids"][1:-1]
                encode_res["token_type_ids"] = encode_res["token_type_ids"][1:
                                                                            -1]
                encode_res["attention_mask"] = encode_res["attention_mask"][1:
                                                                            -1]
            if label.lower() == "other":
                gt_label.extend([0] * len(encode_res["input_ids"]))
            else:
                gt_label.append(self.label2id_map[("b-" + label).upper()])
                gt_label.extend([self.label2id_map[("i-" + label).upper()]] *
                                (len(encode_res["input_ids"]) - 1))
            if self.contains_re:
                if gt_label[0] != self.label2id_map["O"]:
                    entity_id_to_index_map[info["id"]] = len(entities)
                    entities.append({
                        "start": len(input_ids_list),
                        "end":
                        len(input_ids_list) + len(encode_res["input_ids"]),
                        "label": label.upper(),
                    })
            input_ids_list.extend(encode_res["input_ids"])
            token_type_ids_list.extend(encode_res["token_type_ids"])
            bbox_list.extend([bbox] * len(encode_res["input_ids"]))
            gt_label_list.extend(gt_label)
            words_list.append(text)

        encoded_inputs = {
            "input_ids": input_ids_list,
            "labels": gt_label_list,
            "token_type_ids": token_type_ids_list,
            "bbox": bbox_list,
            "attention_mask": [1] * len(input_ids_list),
            # "words_list": words_list,
        }
        encoded_inputs = self.pad_sentences(
            encoded_inputs,
            max_seq_len=self.max_seq_len,
            return_attention_mask=self.return_attention_mask)
        encoded_inputs = self.truncate_inputs(encoded_inputs)

        if self.contains_re:
            relations = self._relations(entities, relations, id2label,
                                        empty_entity, entity_id_to_index_map)
            encoded_inputs['relations'] = relations
            encoded_inputs['entities'] = entities
        return encoded_inputs

    def _chunk_ser(self, encoded_inputs):
        encoded_inputs_all = []
        seq_len = len(encoded_inputs['input_ids'])
        chunk_size = 512
        for chunk_id, index in enumerate(range(0, seq_len, chunk_size)):
            chunk_beg = index
            chunk_end = min(index + chunk_size, seq_len)
            encoded_inputs_example = {}
            for key in encoded_inputs:
                encoded_inputs_example[key] = encoded_inputs[key][chunk_beg:
                                                                  chunk_end]

            encoded_inputs_all.append(encoded_inputs_example)
        return encoded_inputs_all

    def _chunk_re(self, encoded_inputs):
        # prepare data
        entities = encoded_inputs.pop('entities')
        relations = encoded_inputs.pop('relations')
        encoded_inputs_all = []
        chunk_size = 512
        for chunk_id, index in enumerate(
                range(0, len(encoded_inputs["input_ids"]), chunk_size)):
            item = {}
            for k in encoded_inputs:
                item[k] = encoded_inputs[k][index:index + chunk_size]

            # select entity in current chunk
            entities_in_this_span = []
            global_to_local_map = {}
            for entity_id, entity in enumerate(entities):
                if (index <= entity["start"] < index + chunk_size and
                        index <= entity["end"] < index + chunk_size):
                    entity["start"] = entity["start"] - index
                    entity["end"] = entity["end"] - index
                    global_to_local_map[entity_id] = len(entities_in_this_span)
                    entities_in_this_span.append(entity)

            # select relations in current chunk
            relations_in_this_span = []
            for relation in relations:
                if (index <= relation["start_index"] < index + chunk_size and
                        index <= relation["end_index"] < index + chunk_size):
                    relations_in_this_span.append({
                        "head": global_to_local_map[relation["head"]],
                        "tail": global_to_local_map[relation["tail"]],
                        "start_index": relation["start_index"] - index,
                        "end_index": relation["end_index"] - index,
                    })
            item.update({
                "entities": reformat(entities_in_this_span),
                "relations": reformat(relations_in_this_span),
            })
            item['entities']['label'] = [
                self.entities_labels[x] for x in item['entities']['label']
            ]
            encoded_inputs_all.append(item)
        return encoded_inputs_all

    def _relations(self, entities, relations, id2label, empty_entity,
                   entity_id_to_index_map):
        """
        build relations
        """
        relations = list(set(relations))
        relations = [
            rel for rel in relations
            if rel[0] not in empty_entity and rel[1] not in empty_entity
        ]
        kv_relations = []
        for rel in relations:
            pair = [id2label[rel[0]], id2label[rel[1]]]
            if pair == ["question", "answer"]:
                kv_relations.append({
                    "head": entity_id_to_index_map[rel[0]],
                    "tail": entity_id_to_index_map[rel[1]]
                })
            elif pair == ["answer", "question"]:
                kv_relations.append({
                    "head": entity_id_to_index_map[rel[1]],
                    "tail": entity_id_to_index_map[rel[0]]
                })
            else:
                continue
        relations = sorted(
            [{
                "head": rel["head"],
                "tail": rel["tail"],
                "start_index": get_relation_span(rel, entities)[0],
                "end_index": get_relation_span(rel, entities)[1],
            } for rel in kv_relations],
            key=lambda x: x["head"], )
        return relations

    def load_img(self, image_path):
        # read img
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        resize_h, resize_w = self.img_size
        im_shape = img.shape[0:2]
        im_scale_y = resize_h / im_shape[0]
        im_scale_x = resize_w / im_shape[1]
        img_new = cv2.resize(
            img, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=2)
        mean = np.array([0.485, 0.456, 0.406])[np.newaxis, np.newaxis, :]
        std = np.array([0.229, 0.224, 0.225])[np.newaxis, np.newaxis, :]
        img_new = img_new / 255.0
        img_new -= mean
        img_new /= std
        img = img_new.transpose((2, 0, 1))
        return img

    def __getitem__(self, idx):
        if self.load_mode == "all":
            data = copy.deepcopy(self.encoded_inputs_all[idx])
        else:
            data = self._parse_label_file(self.all_lines[idx])[0]

        image_path = data.pop('image_path')
        data["image"] = self.load_img(image_path)

        return_data = {}
        for k, v in data.items():
            if k in self.return_keys:
                if self.return_keys[k] == 'np':
                    v = np.array(v)
                return_data[k] = v
        return return_data

    def __len__(self, ):
        if self.load_mode == "all":
            return len(self.encoded_inputs_all)
        else:
            return len(self.all_lines)


def get_relation_span(rel, entities):
    bound = []
    for entity_index in [rel["head"], rel["tail"]]:
        bound.append(entities[entity_index]["start"])
        bound.append(entities[entity_index]["end"])
    return min(bound), max(bound)


def reformat(data):
    new_data = {}
    for item in data:
        for k, v in item.items():
            if k not in new_data:
                new_data[k] = []
            new_data[k].append(v)
    return new_data
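
# Quick example (editor's note): reformat turns a list of dicts into a dict of
# lists, which is the per-chunk layout that _chunk_re stores.
#
#     reformat([{"start": 0, "label": "QUESTION"},
#               {"start": 7, "label": "ANSWER"}])
#     # -> {"start": [0, 7], "label": ["QUESTION", "ANSWER"]}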

@ -30,6 +30,7 @@ function func_set_params(){

function func_parser_params(){
    strs=$1
    MODE=$2
    IFS=":"
    array=(${strs})
    key=${array[0]}

@ -0,0 +1,19 @@
===========================ch_PP-OCRv2===========================
model_name:ch_PP-OCRv2
python:python3.7
infer_model:./inference/ch_PP-OCRv2_det_infer/
infer_export:null
infer_quant:False
inference:tools/infer/predict_system.py
--use_gpu:False|True
--enable_mkldnn:False|True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
--rec_model_dir:./inference/ch_PP-OCRv2_rec_infer/
--benchmark:True
null:null
null:null

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn system
runtime_device:ARM_CPU
det_infer_model:ch_PP-OCRv2_det_infer|ch_PP-OCRv2_det_slim_quant_infer
rec_infer_model:ch_PP-OCRv2_rec_infer|ch_PP-OCRv2_rec_slim_quant_infer
cls_infer_model:ch_ppocr_mobile_v2.0_cls_infer|ch_ppocr_mobile_v2.0_cls_slim_infer
--cpu_threads:1|4
--det_batch_size:1
--rec_batch_size:1
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
--rec_dict_dir:./ppocr_keys_v1.txt
--benchmark:True

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn system
runtime_device:ARM_GPU_OPENCL
det_infer_model:ch_PP-OCRv2_det_infer|ch_PP-OCRv2_det_slim_quant_infer
rec_infer_model:ch_PP-OCRv2_rec_infer|ch_PP-OCRv2_rec_slim_quant_infer
cls_infer_model:ch_ppocr_mobile_v2.0_cls_infer|ch_ppocr_mobile_v2.0_cls_slim_infer
--cpu_threads:1|4
--det_batch_size:1
--rec_batch_size:1
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
--rec_dict_dir:./ppocr_keys_v1.txt
--benchmark:True

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn det
runtime_device:ARM_CPU
det_infer_model:ch_PP-OCRv2_det_infer|ch_PP-OCRv2_det_slim_quant_infer
null:null
null:null
--cpu_threads:1|4
--det_batch_size:1
null:null
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
null:null
--benchmark:True

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn det
runtime_device:ARM_GPU_OPENCL
det_infer_model:ch_PP-OCRv2_det_infer|ch_PP-OCRv2_det_slim_quant_infer
null:null
null:null
--cpu_threads:1|4
--det_batch_size:1
null:null
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
null:null
--benchmark:True
@ -1,20 +1,20 @@
===========================train_params===========================
model_name:PPOCRv2_ocr_det
model_name:ch_PPOCRv2_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_infer=1|whole_train_infer=500
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=500
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train|pact_train
norm_train:tools/train.py -c configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml -o
pact_train:deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml -o
norm_train:tools/train.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
pact_train:deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
fpgm_train:null
distill_train:null
null:null

@ -27,8 +27,8 @@ null:null
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml -o
quant_export:deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv2/ch_PP-OCR_det_cml.yml -o
norm_export:tools/export_model.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
quant_export:deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
fpgm_export:
distill_export:null
export1:null

@ -0,0 +1,21 @@
===========================kl_quant_params===========================
model_name:PPOCRv2_ocr_det_kl
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_PP-OCRv2_det_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
infer_quant:True
inference:tools/infer/predict_det.py
--use_gpu:False|True
--enable_mkldnn:True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null
null:null
@ -0,0 +1,51 @@
===========================train_params===========================
model_name:PPOCRv2_ocr_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=500
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:pact_train
norm_train:null
pact_train:deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:null
quant_export:deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml -o
fpgm_export:
distill_export:null
export1:null
export2:null
inference_dir:Student
infer_model:./inference/ch_PP-OCRv2_det_infer/
infer_export:null
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null
@ -0,0 +1,159 @@
Global:
  debug: false
  use_gpu: true
  epoch_num: 800
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec_pp-OCRv2_distillation
  save_epoch_step: 3
  eval_batch_step: [0, 2000]
  cal_metric_during_train: true
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: false
  infer_img: doc/imgs_words/ch/word_1.jpg
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 25
  infer_mode: false
  use_space_char: true
  distributed: true
  save_res_path: ./output/rec/predicts_pp-OCRv2_distillation.txt


Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Piecewise
    decay_epochs: [700, 800]
    values: [0.001, 0.0001]
    warmup_epoch: 5
  regularizer:
    name: L2
    factor: 2.0e-05

Architecture:
  model_type: &model_type "rec"
  name: DistillationModel
  algorithm: Distillation
  Models:
    Teacher:
      pretrained:
      freeze_params: false
      return_all_feats: true
      model_type: *model_type
      algorithm: CRNN
      Transform:
      Backbone:
        name: MobileNetV1Enhance
        scale: 0.5
      Neck:
        name: SequenceEncoder
        encoder_type: rnn
        hidden_size: 64
      Head:
        name: CTCHead
        mid_channels: 96
        fc_decay: 0.00002
    Student:
      pretrained:
      freeze_params: false
      return_all_feats: true
      model_type: *model_type
      algorithm: CRNN
      Transform:
      Backbone:
        name: MobileNetV1Enhance
        scale: 0.5
      Neck:
        name: SequenceEncoder
        encoder_type: rnn
        hidden_size: 64
      Head:
        name: CTCHead
        mid_channels: 96
        fc_decay: 0.00002


Loss:
  name: CombinedLoss
  loss_config_list:
  - DistillationCTCLoss:
      weight: 1.0
      model_name_list: ["Student", "Teacher"]
      key: head_out
  - DistillationDMLLoss:
      weight: 1.0
      act: "softmax"
      use_log: true
      model_name_pairs:
      - ["Student", "Teacher"]
      key: head_out
  - DistillationDistanceLoss:
      weight: 1.0
      mode: "l2"
      model_name_pairs:
      - ["Student", "Teacher"]
      key: backbone_out

PostProcess:
  name: DistillationCTCLabelDecode
  model_name: ["Student", "Teacher"]
  key: head_out

Metric:
  name: DistillationMetric
  base_metric_name: RecMetric
  main_indicator: acc
  key: "Student"

Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/ic15_data/
    label_file_list:
    - ./train_data/ic15_data/rec_gt_train.txt
    transforms:
    - DecodeImage:
        img_mode: BGR
        channel_first: false
    - RecAug:
    - CTCLabelEncode:
    - RecResizeImg:
        image_shape: [3, 32, 320]
    - KeepKeys:
        keep_keys:
        - image
        - label
        - length
  loader:
    shuffle: true
    batch_size_per_card: 128
    drop_last: true
    num_sections: 1
    num_workers: 8
Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/ic15_data
    label_file_list:
    - ./train_data/ic15_data/rec_gt_test.txt
    transforms:
    - DecodeImage:
        img_mode: BGR
        channel_first: false
    - CTCLabelEncode:
    - RecResizeImg:
        image_shape: [3, 32, 320]
    - KeepKeys:
        keep_keys:
        - image
        - label
        - length
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 128
    num_workers: 8
@ -0,0 +1,53 @@
===========================train_params===========================
model_name:PPOCRv2_ocr_rec
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=128|whole_train_whole_infer=128
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./inference/rec_inference
null:null
##
trainer:norm_train
norm_train:tools/train.py -c test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml -o
quant_export:
fpgm_export:
distill_export:null
export1:null
export2:null
inference_dir:Student
infer_model:./inference/ch_PP-OCRv2_rec_infer/
infer_export:null
infer_quant:False
inference:tools/infer/predict_rec.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--rec_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True
null:null


@ -0,0 +1,21 @@
===========================kl_quant_params===========================
model_name:PPOCRv2_ocr_rec_kl
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_PP-OCRv2_rec_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml -o
infer_quant:True
inference:tools/infer/predict_rec.py
--use_gpu:False|True
--enable_mkldnn:False|True
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:True
--precision:int8
--rec_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True
null:null
null:null

@ -0,0 +1,53 @@
===========================train_params===========================
model_name:PPOCRv2_ocr_rec_pact
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=3|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=128|whole_train_whole_infer=128
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./inference/rec_inference
null:null
##
trainer:pact_train
norm_train:deploy/slim/quantization/quant.py -c test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/ch_PP-OCRv2_rec/ch_PP-OCRv2_rec_distillation.yml -o
quant_export:
fpgm_export:
distill_export:null
export1:null
export2:null
inference_dir:Student
infer_model:./inference/ch_PP-OCRv2_rec_infer/
infer_export:null
infer_quant:True
inference:tools/infer/predict_rec.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--rec_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True
null:null

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ocr_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:fpgm_train
norm_train:null
pact_train:null
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:null
quant_export:null
fpgm_export:deploy/slim/prune/export_prune_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:null
infer_export:null
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null

@ -0,0 +1,19 @@
===========================ch_ppocr_mobile_v2.0===========================
model_name:ch_ppocr_mobile_v2.0
python:python3.7
infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/
infer_export:null
infer_quant:False
inference:tools/infer/predict_system.py
--use_gpu:False|True
--enable_mkldnn:False|True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
--rec_model_dir:./inference/ch_ppocr_mobile_v2.0_rec_infer/
--benchmark:True
null:null
null:null

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn system
runtime_device:ARM_CPU
det_infer_model:ch_ppocr_mobile_v2.0_det_infer|ch_ppocr_db_mobile_v2.0_det_quant_infer
rec_infer_model:ch_ppocr_mobile_v2.0_rec_infer|ch_ppocr_mobile_v2.0_rec_slim_infer
cls_infer_model:ch_ppocr_mobile_v2.0_cls_infer|ch_ppocr_mobile_v2.0_cls_slim_infer
--cpu_threads:1|4
--det_batch_size:1
--rec_batch_size:1
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
--rec_dict_dir:./ppocr_keys_v1.txt
--benchmark:True

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn system
runtime_device:ARM_GPU_OPENCL
det_infer_model:ch_ppocr_mobile_v2.0_det_infer|ch_ppocr_db_mobile_v2.0_det_quant_infer
rec_infer_model:ch_ppocr_mobile_v2.0_rec_infer|ch_ppocr_mobile_v2.0_rec_slim_infer
cls_infer_model:ch_ppocr_mobile_v2.0_cls_infer|ch_ppocr_mobile_v2.0_cls_slim_infer
--cpu_threads:1|4
--det_batch_size:1
--rec_batch_size:1
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
--rec_dict_dir:./ppocr_keys_v1.txt
--benchmark:True
@ -1,12 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn det
infer_model:ch_PP-OCRv2_det_infer|ch_PP-OCRv2_det_slim_quant_infer
runtime_device:ARM_CPU
det_infer_model:ch_ppocr_mobile_v2.0_det_infer|ch_ppocr_db_mobile_v2.0_det_quant_infer
null:null
null:null
--cpu_threads:1|4
--det_batch_size:1
--rec_batch_size:1
--system_batch_size:1
null:null
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
--rec_dict_dir:./ppocr_keys_v1.txt
null:null
--benchmark:True

@ -0,0 +1,13 @@
===========================lite_params===========================
inference:./ocr_db_crnn det
runtime_device:ARM_GPU_OPENCL
det_infer_model:ch_ppocr_mobile_v2.0_det_infer|ch_ppocr_db_mobile_v2.0_det_quant_infer
null:null
null:null
--cpu_threads:1|4
--det_batch_size:1
null:null
--image_dir:./test_data/icdar2015_lite/text_localization/ch4_test_images/
--config_dir:./config.txt
null:null
--benchmark:True

@ -15,4 +15,4 @@ op.det.local_service_conf.thread_num:1|6
op.det.local_service_conf.use_trt:False|True
op.det.local_service_conf.precision:fp32|fp16|int8
pipline:pipeline_rpc_client.py|pipeline_http_client.py
--image_dir:../../doc/imgs
@ -1,10 +1,10 @@
===========================train_params===========================
model_name:ocr_det
model_name:ch_ppocr_mobile_v2.0_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=300
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null

@ -12,10 +12,10 @@ train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train|pact_train|fpgm_train
norm_train:tools/train.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
trainer:norm_train
norm_train:tools/train.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null

@ -27,9 +27,9 @@ null:null
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
norm_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null

@ -13,9 +13,9 @@ train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train|pact_train|fpgm_train
norm_train:tools/train.py -c test_tipc/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/det_mv3_db.yml -o
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
norm_train:tools/train.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
distill_train:null
null:null
null:null

@ -27,9 +27,9 @@ null:null
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_db.yml -o
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/det_mv3_db.yml -o
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/det_mv3_db.yml -o
norm_export:tools/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
distill_export:null
export1:null
export2:null

@ -13,9 +13,9 @@ train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train|pact_train|fpgm_train
norm_train:tools/train.py -c test_tipc/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/det_mv3_db.yml -o
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
norm_train:tools/train.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/det_mv3_db_v2.0_train/best_accuracy
distill_train:null
null:null
null:null

@ -27,9 +27,9 @@ null:null
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_db.yml -o
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/det_mv3_db.yml -o
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/det_mv3_db.yml -o
norm_export:tools/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/ppocr_det_mobile/det_mv3_db.yml -o
distill_export:null
export1:null
export2:null
@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ocr_det
python:python
gpu_list:-1
Global.use_gpu:False
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train
norm_train:tools/train.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:./inference/ch_ppocr_mobile_v2.0_det_train/best_accuracy
infer_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:False
--enable_mkldnn:False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null

@ -0,0 +1,52 @@
===========================train_params===========================
model_name:ocr_det
python:python
gpu_list:0
Global.use_gpu:True
Global.auto_cast:fp32|amp
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train
norm_train:tools/train.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:./inference/ch_ppocr_mobile_v2.0_det_train/best_accuracy
infer_export:tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null


@ -0,0 +1,21 @@
===========================kl_quant_params===========================
model_name:ch_ppocr_mobile_v2.0_det_KL
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
infer_quant:True
inference:tools/infer/predict_det.py
--use_gpu:False|True
--enable_mkldnn:True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null
null:null

@ -0,0 +1,17 @@
===========================kl_quant_params===========================
infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
infer_quant:True
inference:tools/infer/predict_det.py
--use_gpu:False
--enable_mkldnn:False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null
null:null

@ -0,0 +1,17 @@
===========================kl_quant_params===========================
infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
infer_quant:True
inference:tools/infer/predict_det.py
--use_gpu:False
--enable_mkldnn:False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null
null:null
@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ocr_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:pact_train
norm_train:null
pact_train:deploy/slim/quantization/quant.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:null
quant_export:deploy/slim/quantization/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o
fpgm_export:null
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:null
infer_export:null
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null

@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ch_ppocr_mobile_v2.0_rec
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_infer=2|whole_train_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_infer=128|whole_train_infer=128
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./inference/rec_inference
null:null
##
trainer:norm_train
norm_train:tools/train.py -c configs/rec/rec_icdar15_train.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:tools/eval.py -c configs/rec/rec_icdar15_train.yml -o
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c configs/rec/rec_icdar15_train.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
##
infer_model:null
infer_export:tools/export_model.py -c configs/rec/rec_icdar15_train.yml -o
infer_quant:False
inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --rec_image_shape="3,32,100" --rec_algorithm="RARE"
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:True|False
--precision:fp32|fp16|int8
--rec_model_dir:
--image_dir:./inference/rec_inference
--save_log_path:./test/output/
--benchmark:True
null:null
@ -0,0 +1,102 @@
Global:
  use_gpu: true
  epoch_num: 500
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec_chinese_lite_v2.0
  save_epoch_step: 3
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 25
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_chinese_lite_v2.0.txt


Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Cosine
    learning_rate: 0.001
  regularizer:
    name: 'L2'
    factor: 0.00001

Architecture:
  model_type: rec
  algorithm: CRNN
  Transform:
  Backbone:
    name: MobileNetV3
    scale: 0.5
    model_name: small
    small_stride: [1, 2, 2, 2]
    disable_se: True
  Neck:
    name: SequenceEncoder
    encoder_type: rnn
    hidden_size: 48
  Head:
    name: CTCHead
    fc_decay: 0.00001

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/ic15_data
    label_file_list: ["train_data/ic15_data/rec_gt_train.txt"]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - RecAug:
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 32, 320]
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 8

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/ic15_data
    label_file_list: ["train_data/ic15_data/rec_gt_test.txt"]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 32, 320]
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 8
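Note: a config like the one above is consumed by tools/train.py via -c, with -o overriding individual keys, mirroring the norm_train entries elsewhere in this diff. A hedged example invocation; the yml path is assumed from the fpgm_train entry in the next file, since the diff view hides file names:

# Hedged sketch: launch a short training run with the config above.
# Overrides follow the "-o key=value" convention used throughout this diff.
python tools/train.py \
    -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_FPGM/rec_chinese_lite_train_v2.0.yml \
    -o Global.epoch_num=1 Train.loader.batch_size_per_card=128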
@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ch_ppocr_mobile_v2.0_rec_FPGM
python:python3.7
gpu_list:0
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=128|whole_train_whole_infer=128
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/ic15_data/test/word_1.png
null:null
##
trainer:fpgm_train
norm_train:null
pact_train:null
fpgm_train:deploy/slim/prune/sensitivity_anal.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_FPGM/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model=./pretrain_models/ch_ppocr_mobile_v2.0_rec_train/best_accuracy
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:null
quant_export:null
fpgm_export:deploy/slim/prune/export_prune_model.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_FPGM/rec_chinese_lite_train_v2.0.yml -o
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:null
infer_export:null
infer_quant:False
inference:tools/infer/predict_rec.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|int8
--rec_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True
null:null
@ -0,0 +1,21 @@
===========================kl_quant_params===========================
model_name:ch_ppocr_mobile_v2.0_rec_KL
python:python3.7
Global.pretrained_model:null
Global.save_inference_dir:null
infer_model:./inference/ch_ppocr_mobile_v2.0_rec_infer/
infer_export:deploy/slim/quantization/quant_kl.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_KL/rec_chinese_lite_train_v2.0.yml -o
infer_quant:True
inference:tools/infer/predict_rec.py
--use_gpu:False|True
--enable_mkldnn:True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:int8
--rec_model_dir:
--image_dir:./inference/rec_inference
null:null
--benchmark:True
null:null
null:null
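Note: this file drives KL offline quantization of the recognizer: infer_export produces an int8 inference model from the FP32 one, then the quantized model is run through predict_rec.py. A hedged sketch of that flow; the output directory name and the -o override key are assumptions, not taken from this diff:

# Hedged sketch of the KL quantization chain configured above.
python deploy/slim/quantization/quant_kl.py \
    -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_KL/rec_chinese_lite_train_v2.0.yml \
    -o Global.save_inference_dir=./output/rec_kl_quant/   # assumed override
python tools/infer/predict_rec.py \
    --rec_model_dir=./output/rec_kl_quant/ \
    --image_dir=./inference/rec_inference \
    --use_gpu=False --precision=int8 --benchmark=True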
@ -0,0 +1,101 @@
Global:
  use_gpu: true
  epoch_num: 500
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec_chinese_lite_v2.0
  save_epoch_step: 3
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 25
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_chinese_lite_v2.0.txt


Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Cosine
    learning_rate: 0.001
  regularizer:
    name: 'L2'
    factor: 0.00001

Architecture:
  model_type: rec
  algorithm: CRNN
  Transform:
  Backbone:
    name: MobileNetV3
    scale: 0.5
    model_name: small
    small_stride: [1, 2, 2, 2]
  Neck:
    name: SequenceEncoder
    encoder_type: rnn
    hidden_size: 48
  Head:
    name: CTCHead
    fc_decay: 0.00001

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/ic15_data
    label_file_list: ["train_data/ic15_data/rec_gt_train.txt"]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - RecAug:
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 32, 320]
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 8

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/ic15_data
    label_file_list: ["train_data/ic15_data/rec_gt_test.txt"]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 32, 320]
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 8
@ -0,0 +1,101 @@
Global:
  use_gpu: true
  epoch_num: 500
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec_chinese_lite_v2.0
  save_epoch_step: 3
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words/ch/word_1.jpg
  # for data or label process
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 25
  infer_mode: False
  use_space_char: True
  save_res_path: ./output/rec/predicts_chinese_lite_v2.0.txt


Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Cosine
    learning_rate: 0.001
  regularizer:
    name: 'L2'
    factor: 0.00001

Architecture:
  model_type: rec
  algorithm: CRNN
  Transform:
  Backbone:
    name: MobileNetV3
    scale: 0.5
    model_name: small
    small_stride: [1, 2, 2, 2]
  Neck:
    name: SequenceEncoder
    encoder_type: rnn
    hidden_size: 48
  Head:
    name: CTCHead
    fc_decay: 0.00001

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/ic15_data
    label_file_list: ["train_data/ic15_data/rec_gt_train.txt"]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - RecAug:
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 32, 320]
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 8

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/ic15_data
    label_file_list: ["train_data/ic15_data/rec_gt_test.txt"]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
      - RecResizeImg:
          image_shape: [3, 32, 320]
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 8
@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ch_ppocr_mobile_v2.0_rec_PACT
python:python3.7
gpu_list:0
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=128|whole_train_whole_infer=128
Global.checkpoints:null
train_model_name:latest
train_infer_img_dir:./train_data/ic15_data/test/word_1.png
null:null
##
trainer:pact_train
norm_train:null
pact_train:deploy/slim/quantization/quant.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_PACT/rec_chinese_lite_train_v2.0.yml -o
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.checkpoints:
norm_export:null
quant_export:deploy/slim/quantization/export_model.py -c test_tipc/configs/ch_ppocr_mobile_v2.0_rec_PACT/rec_chinese_lite_train_v2.0.yml -o
fpgm_export:null
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:null
infer_export:null
infer_quant:False
inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ppocr_keys_v1.txt --rec_image_shape="3,32,100"
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--rec_model_dir:
--image_dir:./inference/rec_inference
--save_log_path:./test/output/
--benchmark:True
null:null
@ -0,0 +1,19 @@
===========================ch_ppocr_server_v2.0===========================
model_name:ch_ppocr_server_v2.0
python:python3.7
infer_model:./inference/ch_ppocr_server_v2.0_det_infer/
infer_export:null
infer_quant:True
inference:tools/infer/predict_system.py
--use_gpu:False|True
--enable_mkldnn:False|True
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False
--precision:fp32
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
--rec_model_dir:./inference/ch_ppocr_server_v2.0_rec_infer/
--benchmark:True
null:null
null:null
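Note: this file configures the end-to-end system test, detection plus recognition in one pass. A hedged example run, with flag values taken directly from the lines above (first alternative of each swept flag):

# Hedged: one concrete invocation of the system test configured above.
python tools/infer/predict_system.py \
    --det_model_dir=./inference/ch_ppocr_server_v2.0_det_infer/ \
    --rec_model_dir=./inference/ch_ppocr_server_v2.0_rec_infer/ \
    --image_dir=./inference/ch_det_data_50/all-sum-510/ \
    --use_gpu=False --enable_mkldnn=False --cpu_threads=1 \
    --rec_batch_num=1 --use_tensorrt=False --precision=fp32 \
    --benchmark=True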
@ -1,12 +1,12 @@
===========================train_params===========================
model_name:ocr_server_det
model_name:ch_ppocr_server_v2.0_det
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_infer=2|whole_train_infer=300
Global.epoch_num:lite_train_lite_infer=2|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
@ -0,0 +1,51 @@
===========================train_params===========================
model_name:ch_ppocr_server_v2.0_rec
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=5|whole_train_whole_infer=100
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=128|whole_train_whole_infer=128
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./inference/rec_inference
null:null
##
trainer:norm_train
norm_train:tools/train.py -c test_tipc/configs/ch_ppocr_server_v2.0_rec/rec_icdar15_train.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:tools/eval.py -c test_tipc/configs/ch_ppocr_server_v2.0_rec/rec_icdar15_train.yml -o
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/ch_ppocr_server_v2.0_rec/rec_icdar15_train.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
##
infer_model:null
infer_export:tools/export_model.py -c test_tipc/configs/ch_ppocr_server_v2.0_rec/rec_icdar15_train.yml -o
infer_quant:False
inference:tools/infer/predict_rec.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1|6
--use_tensorrt:True|False
--precision:fp32|int8
--rec_model_dir:
--image_dir:./inference/rec_inference
--save_log_path:./test/output/
--benchmark:True
null:null
@ -0,0 +1,51 @@
===========================train_params===========================
model_name:det_mv3_db_v2.0
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train
norm_train:tools/train.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c configs/det/det_mv3_db.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
inference_dir:null
train_model:./inference/det_mv3_db_v2.0_train/best_accuracy
infer_export:tools/export_model.py -c configs/det/det_mv3_db.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
null:null
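Note: these train_params files are not run by hand; a TIPC driver script reads them and sweeps the | alternatives. A hedged invocation assuming the standard test_tipc layout; the driver script names and the train_infer_python.txt file name are assumptions, since the diff view hides file names:

# Hedged: prepare data/weights for the chosen mode, then run the full chain.
bash test_tipc/prepare.sh \
    test_tipc/configs/det_mv3_db_v2.0/train_infer_python.txt lite_train_lite_infer
bash test_tipc/test_train_inference_python.sh \
    test_tipc/configs/det_mv3_db_v2.0/train_infer_python.txt lite_train_lite_infer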
@ -0,0 +1,109 @@
Global:
  use_gpu: true
  epoch_num: 10000
  log_smooth_window: 20
  print_batch_step: 2
  save_model_dir: ./output/east_mv3/
  save_epoch_step: 1000
  # evaluation is run every 5000 iterations after the 4000th iteration
  eval_batch_step: [4000, 5000]
  cal_metric_during_train: False
  pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img:
  save_res_path: ./output/det_east/predicts_east.txt

Architecture:
  model_type: det
  algorithm: EAST
  Transform:
  Backbone:
    name: MobileNetV3
    scale: 0.5
    model_name: large
  Neck:
    name: EASTFPN
    model_name: small
  Head:
    name: EASTHead
    model_name: small

Loss:
  name: EASTLoss

Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    # name: Cosine
    learning_rate: 0.001
    # warmup_epoch: 0
  regularizer:
    name: 'L2'
    factor: 0

PostProcess:
  name: EASTPostProcess
  score_thresh: 0.8
  cover_thresh: 0.1
  nms_thresh: 0.2

Metric:
  name: DetMetric
  main_indicator: hmean

Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/icdar2015/text_localization/
    label_file_list:
      - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
    ratio_list: [1.0]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - DetLabelEncode: # Class handling label
      - EASTProcessTrain:
          image_shape: [512, 512]
          background_ratio: 0.125
          min_crop_side_ratio: 0.1
          min_text_size: 10
      - KeepKeys:
          keep_keys: ['image', 'score_map', 'geo_map', 'training_mask'] # dataloader will return list in this order
  loader:
    shuffle: True
    drop_last: False
    batch_size_per_card: 16
    num_workers: 8

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/icdar2015/text_localization/
    label_file_list:
      - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - DetLabelEncode: # Class handling label
      - DetResizeForTest:
          limit_side_len: 2400
          limit_type: max
      - NormalizeImage:
          scale: 1./255.
          mean: [0.485, 0.456, 0.406]
          std: [0.229, 0.224, 0.225]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: ['image', 'shape', 'polys', 'ignore_tags']
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 1 # must be 1
    num_workers: 2
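Note: this EAST config pairs with the export and predict entries in the train_params file that follows. A hedged sketch of the export-then-infer step; the paths mirror those entries, while the save_inference_dir value is an assumption:

# Hedged: export the trained EAST model, then run detection with it.
python tools/export_model.py \
    -c test_tipc/configs/det_mv3_east_v2.0/det_mv3_east.yml \
    -o Global.pretrained_model=./inference/det_mv3_east/best_accuracy \
       Global.save_inference_dir=./output/det_east_infer/   # assumed output dir
python tools/infer/predict_det.py \
    --det_algorithm=EAST \
    --det_model_dir=./output/det_east_infer/ \
    --image_dir=./inference/ch_det_data_50/all-sum-510/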
@ -0,0 +1,51 @@
===========================train_params===========================
model_name:det_mv3_east_v2.0
python:python3.7
gpu_list:0
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=500
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train
norm_train:tools/train.py -c test_tipc/configs/det_mv3_east_v2.0/det_mv3_east.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_east_v2.0/det_mv3_east.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
##
train_model:./inference/det_mv3_east/best_accuracy
infer_export:tools/export_model.py -c test_tipc/configs/det_mv3_east_v2.0/det_mv3_east.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
--save_log_path:null
--benchmark:True
--det_algorithm:EAST
@ -0,0 +1,135 @@
Global:
  use_gpu: true
  epoch_num: 600
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/det_mv3_pse/
  save_epoch_step: 600
  # evaluation is run every 1000 iterations
  eval_batch_step: [ 0,1000 ]
  cal_metric_during_train: False
  pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained
  checkpoints: #./output/det_r50_vd_pse_batch8_ColorJitter/best_accuracy
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_en/img_10.jpg
  save_res_path: ./output/det_pse/predicts_pse.txt

Architecture:
  model_type: det
  algorithm: PSE
  Transform: null
  Backbone:
    name: MobileNetV3
    scale: 0.5
    model_name: large
  Neck:
    name: FPN
    out_channels: 96
  Head:
    name: PSEHead
    hidden_dim: 96
    out_channels: 7

Loss:
  name: PSELoss
  alpha: 0.7
  ohem_ratio: 3
  kernel_sample_mask: pred
  reduction: none

Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Step
    learning_rate: 0.001
    step_size: 200
    gamma: 0.1
  regularizer:
    name: 'L2'
    factor: 0.0005

PostProcess:
  name: PSEPostProcess
  thresh: 0
  box_thresh: 0.85
  min_area: 16
  box_type: box # 'box' or 'poly'
  scale: 1

Metric:
  name: DetMetric
  main_indicator: hmean

Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/icdar2015/text_localization/
    label_file_list:
      - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
    ratio_list: [ 1.0 ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - DetLabelEncode: # Class handling label
      - ColorJitter:
          brightness: 0.12549019607843137
          saturation: 0.5
      - IaaAugment:
          augmenter_args:
            - { 'type': Resize, 'args': { 'size': [ 0.5, 3 ] } }
            - { 'type': Fliplr, 'args': { 'p': 0.5 } }
            - { 'type': Affine, 'args': { 'rotate': [ -10, 10 ] } }
      - MakePseGt:
          kernel_num: 7
          min_shrink_ratio: 0.4
          size: 640
      - RandomCropImgMask:
          size: [ 640,640 ]
          main_key: gt_text
          crop_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ]
      - NormalizeImage:
          scale: 1./255.
          mean: [ 0.485, 0.456, 0.406 ]
          std: [ 0.229, 0.224, 0.225 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'image', 'gt_text', 'gt_kernels', 'mask' ] # the order of the dataloader list
  loader:
    shuffle: True
    drop_last: False
    batch_size_per_card: 16
    num_workers: 8

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/icdar2015/text_localization/
    label_file_list:
      - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
    ratio_list: [ 1.0 ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - DetLabelEncode: # Class handling label
      - DetResizeForTest:
          limit_side_len: 736
          limit_type: min
      - NormalizeImage:
          scale: 1./255.
          mean: [ 0.485, 0.456, 0.406 ]
          std: [ 0.229, 0.224, 0.225 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'image', 'shape', 'polys', 'ignore_tags' ]
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 1 # must be 1
    num_workers: 8
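Note: in these configs eval_batch_step follows a [start_iter, interval] convention: evaluation begins at start_iter and repeats every interval iterations (so [4000, 5000] in the EAST config above means every 5000 iterations after the 4000th). A hedged bash illustration of that check; the variable names are illustrative, not PaddleOCR internals:

# Hedged illustration of the [start_iter, interval] eval_batch_step rule.
start_iter=0
interval=1000
global_step=4000
if [ "${global_step}" -ge "${start_iter}" ] && \
   [ $(( (global_step - start_iter) % interval )) -eq 0 ]; then
    echo "run evaluation at step ${global_step}"
fi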
@ -0,0 +1,51 @@
===========================train_params===========================
model_name:det_mv3_pse_v2.0
python:python3.7
gpu_list:0
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=500
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
##
trainer:norm_train
norm_train:tools/train.py -c test_tipc/configs/det_mv3_pse_v2.0/det_mv3_pse.yml -o
pact_train:null
fpgm_train:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
Global.save_inference_dir:./output/
Global.pretrained_model:
norm_export:tools/export_model.py -c test_tipc/configs/det_mv3_pse_v2.0/det_mv3_pse.yml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
##
train_model:./inference/det_mv3_pse/best_accuracy
infer_export:tools/export_model.py -c test_tipc/configs/det_mv3_pse_v2.0/det_mv3_pse.yml -o
infer_quant:False
inference:tools/infer/predict_det.py
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
--rec_batch_num:1
--use_tensorrt:False|True
--precision:fp32|fp16|int8
--det_model_dir:
--image_dir:./inference/ch_det_data_50/all-sum-510/
--save_log_path:null
--benchmark:True
--det_algorithm:PSE