PaddleOCR/notebook/notebook_ch/6.document_analysis/文档分析实战-VQA.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# DOC-VQA SER实战\n",
    "\n",
    "本节将介绍如何使用PaddleOCR完成DOC-VQA SER（Semantic Entity Recognition，语义实体识别）算法的训练与运行，包括：\n",
    "\n",
    "1. 理解DOC-VQA SER算法原理\n",
    "2. 掌握PaddleOCR里DOC-VQA SER代码的训练流程\n",
    "\n",
    "## 1. 快速体验\n",
    "\n",
    "准备代码和环境"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Requirement already satisfied: pip in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (21.3.1)\n",
      "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Requirement already satisfied: shapely in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 1)) (1.8.0)\n",
      "Requirement already satisfied: scikit-image in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (0.19.1)\n",
      "Requirement already satisfied: imgaug==0.4.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (0.4.0)\n",
      "Requirement already satisfied: pyclipper in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 4)) (1.3.0.post2)\n",
      "Requirement already satisfied: lmdb in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 5)) (1.2.1)\n",
      "Requirement already satisfied: tqdm in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 6)) (4.27.0)\n",
      "Requirement already satisfied: numpy in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 7)) (1.20.3)\n",
      "Requirement already satisfied: visualdl in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.2.0)\n",
      "Requirement already satisfied: python-Levenshtein in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 9)) (0.12.2)\n",
      "Requirement already satisfied: opencv-contrib-python==4.4.0.46 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 10)) (4.4.0.46)\n",
      "Requirement already satisfied: cython in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 11)) (0.29)\n",
      "Requirement already satisfied: lxml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 12)) (4.7.1)\n",
      "Requirement already satisfied: premailer in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 13)) (3.10.0)\n",
      "Requirement already satisfied: openpyxl in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 14)) (3.0.5)\n",
      "Requirement already satisfied: fasttext==0.9.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 15)) (0.9.1)\n",
      "Requirement already satisfied: six in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (1.15.0)\n",
      "Requirement already satisfied: matplotlib in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (2.2.3)\n",
      "Requirement already satisfied: Pillow in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (7.1.2)\n",
      "Requirement already satisfied: imageio in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (2.6.1)\n",
      "Requirement already satisfied: scipy in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (1.6.3)\n",
      "Requirement already satisfied: opencv-python in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (4.1.1.26)\n",
      "Requirement already satisfied: pybind11>=2.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from fasttext==0.9.1->-r /home/aistudio/PaddleOCR/requirements.txt (line 15)) (2.8.1)\n",
      "Requirement already satisfied: setuptools>=0.7.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from fasttext==0.9.1->-r /home/aistudio/PaddleOCR/requirements.txt (line 15)) (56.2.0)\n",
      "Requirement already satisfied: tifffile>=2019.7.26 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (2021.11.2)\n",
      "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (20.9)\n",
      "Requirement already satisfied: networkx>=2.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (2.4)\n",
      "Requirement already satisfied: PyWavelets>=1.1.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (1.2.0)\n",
      "Requirement already satisfied: Flask-Babel>=1.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.0.0)\n",
      "Requirement already satisfied: shellcheck-py in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.7.1.1)\n",
      "Requirement already satisfied: protobuf>=3.11.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (3.14.0)\n",
      "Requirement already satisfied: pandas in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.1.5)\n",
      "Requirement already satisfied: flask>=1.1.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.1.1)\n",
      "Requirement already satisfied: requests in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.22.0)\n",
      "Requirement already satisfied: bce-python-sdk in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.8.53)\n",
      "Requirement already satisfied: pre-commit in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.21.0)\n",
      "Requirement already satisfied: flake8>=3.7.9 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (3.8.2)\n",
      "Requirement already satisfied: cssutils in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->-r /home/aistudio/PaddleOCR/requirements.txt (line 13)) (2.3.0)\n",
      "Requirement already satisfied: cachetools in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->-r /home/aistudio/PaddleOCR/requirements.txt (line 13)) (4.0.0)\n",
      "Requirement already satisfied: cssselect in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->-r /home/aistudio/PaddleOCR/requirements.txt (line 13)) (1.1.0)\n",
      "Requirement already satisfied: et-xmlfile in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from openpyxl->-r /home/aistudio/PaddleOCR/requirements.txt (line 14)) (1.0.1)\n",
      "Requirement already satisfied: jdcal in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from openpyxl->-r /home/aistudio/PaddleOCR/requirements.txt (line 14)) (1.4.1)\n",
      "Requirement already satisfied: importlib-metadata in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.23)\n",
      "Requirement already satisfied: pycodestyle<2.7.0,>=2.6.0a1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.6.0)\n",
      "Requirement already satisfied: mccabe<0.7.0,>=0.6.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.6.1)\n",
      "Requirement already satisfied: pyflakes<2.3.0,>=2.2.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.2.0)\n",
      "Requirement already satisfied: Werkzeug>=0.15 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.16.0)\n",
      "Requirement already satisfied: itsdangerous>=0.24 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.1.0)\n",
      "Requirement already satisfied: Jinja2>=2.10.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.11.0)\n",
      "Requirement already satisfied: click>=5.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (7.0)\n",
      "Requirement already satisfied: pytz in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Flask-Babel>=1.0.0->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2019.3)\n",
      "Requirement already satisfied: Babel>=2.3 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Flask-Babel>=1.0.0->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.8.0)\n",
      "Requirement already satisfied: decorator>=4.3.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from networkx>=2.2->scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (4.4.2)\n",
      "Requirement already satisfied: pyparsing>=2.0.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from packaging>=20.0->scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (2.4.2)\n",
      "Requirement already satisfied: pycryptodome>=3.8.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from bce-python-sdk->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (3.9.9)\n",
      "Requirement already satisfied: future>=0.6.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from bce-python-sdk->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.18.0)\n",
      "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (1.1.0)\n",
      "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (0.10.0)\n",
      "Requirement already satisfied: python-dateutil>=2.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (2.8.0)\n",
      "Requirement already satisfied: identify>=1.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.4.10)\n",
      "Requirement already satisfied: nodeenv>=0.11.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.3.4)\n",
      "Requirement already satisfied: aspy.yaml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.3.0)\n",
      "Requirement already satisfied: pyyaml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (5.1.2)\n",
      "Requirement already satisfied: virtualenv>=15.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (16.7.9)\n",
      "Requirement already satisfied: toml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.10.0)\n",
      "Requirement already satisfied: cfgv>=2.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.0.1)\n",
      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.25.6)\n",
      "Requirement already satisfied: idna<2.9,>=2.5 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.8)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2019.9.11)\n",
      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (3.0.4)\n",
      "Requirement already satisfied: MarkupSafe>=0.23 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Jinja2>=2.10.1->flask>=1.1.1->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.1.1)\n",
      "Requirement already satisfied: zipp>=0.5 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from importlib-metadata->flake8>=3.7.9->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (3.6.0)\n",
      "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Requirement already satisfied: paddleocr in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (2.3.0.2)\n",
      "Requirement already satisfied: python-Levenshtein in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (0.12.2)\n",
      "Requirement already satisfied: opencv-contrib-python==4.4.0.46 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (4.4.0.46)\n",
      "Requirement already satisfied: lxml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (4.7.1)\n",
      "Requirement already satisfied: openpyxl in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (3.0.5)\n",
      "Requirement already satisfied: scikit-image in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (0.19.1)\n",
      "Requirement already satisfied: shapely in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (1.8.0)\n",
      "Requirement already satisfied: premailer in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (3.10.0)\n",
      "Requirement already satisfied: numpy in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (1.20.3)\n",
      "Requirement already satisfied: pyclipper in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (1.3.0.post2)\n",
      "Requirement already satisfied: cython in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (0.29)\n",
      "Requirement already satisfied: fasttext==0.9.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (0.9.1)\n",
      "Requirement already satisfied: tqdm in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (4.27.0)\n",
      "Requirement already satisfied: imgaug==0.4.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (0.4.0)\n",
      "Requirement already satisfied: visualdl in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (2.2.0)\n",
      "Requirement already satisfied: lmdb in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (1.2.1)\n",
      "Requirement already satisfied: pybind11>=2.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from fasttext==0.9.1->paddleocr) (2.8.1)\n",
      "Requirement already satisfied: setuptools>=0.7.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from fasttext==0.9.1->paddleocr) (56.2.0)\n",
      "Requirement already satisfied: imageio in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (2.6.1)\n",
      "Requirement already satisfied: scipy in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (1.6.3)\n",
      "Requirement already satisfied: opencv-python in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (4.1.1.26)\n",
      "Requirement already satisfied: six in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (1.15.0)\n",
      "Requirement already satisfied: Pillow in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (7.1.2)\n",
      "Requirement already satisfied: matplotlib in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (2.2.3)\n",
      "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->paddleocr) (20.9)\n",
      "Requirement already satisfied: PyWavelets>=1.1.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->paddleocr) (1.2.0)\n",
      "Requirement already satisfied: tifffile>=2019.7.26 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->paddleocr) (2021.11.2)\n",
      "Requirement already satisfied: networkx>=2.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->paddleocr) (2.4)\n",
      "Requirement already satisfied: jdcal in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from openpyxl->paddleocr) (1.4.1)\n",
      "Requirement already satisfied: et-xmlfile in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from openpyxl->paddleocr) (1.0.1)\n",
      "Requirement already satisfied: requests in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->paddleocr) (2.22.0)\n",
      "Requirement already satisfied: cssutils in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->paddleocr) (2.3.0)\n",
      "Requirement already satisfied: cachetools in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->paddleocr) (4.0.0)\n",
      "Requirement already satisfied: cssselect in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->paddleocr) (1.1.0)\n",
      "Requirement already satisfied: pre-commit in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (1.21.0)\n",
      "Requirement already satisfied: pandas in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (1.1.5)\n",
      "Requirement already satisfied: flask>=1.1.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (1.1.1)\n",
      "Requirement already satisfied: protobuf>=3.11.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (3.14.0)\n",
      "Requirement already satisfied: bce-python-sdk in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (0.8.53)\n",
      "Requirement already satisfied: shellcheck-py in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (0.7.1.1)\n",
      "Requirement already satisfied: flake8>=3.7.9 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (3.8.2)\n",
      "Requirement already satisfied: Flask-Babel>=1.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (1.0.0)\n",
      "Requirement already satisfied: pyflakes<2.3.0,>=2.2.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->paddleocr) (2.2.0)\n",
      "Requirement already satisfied: importlib-metadata in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->paddleocr) (0.23)\n",
      "Requirement already satisfied: mccabe<0.7.0,>=0.6.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->paddleocr) (0.6.1)\n",
      "Requirement already satisfied: pycodestyle<2.7.0,>=2.6.0a1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->paddleocr) (2.6.0)\n",
      "Requirement already satisfied: Werkzeug>=0.15 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->paddleocr) (0.16.0)\n",
      "Requirement already satisfied: itsdangerous>=0.24 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->paddleocr) (1.1.0)\n",
      "Requirement already satisfied: click>=5.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->paddleocr) (7.0)\n",
      "Requirement already satisfied: Jinja2>=2.10.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->paddleocr) (2.11.0)\n",
      "Requirement already satisfied: Babel>=2.3 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Flask-Babel>=1.0.0->visualdl->paddleocr) (2.8.0)\n",
      "Requirement already satisfied: pytz in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Flask-Babel>=1.0.0->visualdl->paddleocr) (2019.3)\n",
      "Requirement already satisfied: decorator>=4.3.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from networkx>=2.2->scikit-image->paddleocr) (4.4.2)\n",
      "Requirement already satisfied: pyparsing>=2.0.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from packaging>=20.0->scikit-image->paddleocr) (2.4.2)\n",
      "Requirement already satisfied: future>=0.6.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from bce-python-sdk->visualdl->paddleocr) (0.18.0)\n",
      "Requirement already satisfied: pycryptodome>=3.8.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from bce-python-sdk->visualdl->paddleocr) (3.9.9)\n",
      "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->paddleocr) (1.1.0)\n",
      "Requirement already satisfied: python-dateutil>=2.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->paddleocr) (2.8.0)\n",
      "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->paddleocr) (0.10.0)\n",
      "Requirement already satisfied: aspy.yaml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (1.3.0)\n",
      "Requirement already satisfied: virtualenv>=15.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (16.7.9)\n",
      "Requirement already satisfied: pyyaml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (5.1.2)\n",
      "Requirement already satisfied: cfgv>=2.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (2.0.1)\n",
      "Requirement already satisfied: toml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (0.10.0)\n",
      "Requirement already satisfied: identify>=1.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (1.4.10)\n",
      "Requirement already satisfied: nodeenv>=0.11.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (1.3.4)\n",
      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->premailer->paddleocr) (3.0.4)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->premailer->paddleocr) (2019.9.11)\n",
      "Requirement already satisfied: idna<2.9,>=2.5 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->premailer->paddleocr) (2.8)\n",
      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->premailer->paddleocr) (1.25.6)\n",
      "Requirement already satisfied: MarkupSafe>=0.23 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Jinja2>=2.10.1->flask>=1.1.1->visualdl->paddleocr) (1.1.1)\n",
      "Requirement already satisfied: zipp>=0.5 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from importlib-metadata->flake8>=3.7.9->visualdl->paddleocr) (3.6.0)\n",
      "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Requirement already satisfied: yacs in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (0.1.8)\n",
      "Requirement already satisfied: gnureadline in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (8.0.0)\n",
      "Requirement already satisfied: paddlenlp==2.2.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (2.2.1)\n",
      "Requirement already satisfied: seqeval in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (1.2.2)\n",
      "Requirement already satisfied: multiprocess in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (0.70.11.1)\n",
      "Requirement already satisfied: colorlog in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (4.1.0)\n",
      "Requirement already satisfied: colorama in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (0.4.4)\n",
      "Requirement already satisfied: h5py in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (2.9.0)\n",
      "Requirement already satisfied: jieba in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (0.42.1)\n",
      "Requirement already satisfied: PyYAML in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from yacs) (5.1.2)\n",
      "Requirement already satisfied: numpy>=1.7 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from h5py->paddlenlp==2.2.1) (1.20.3)\n",
      "Requirement already satisfied: six in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from h5py->paddlenlp==2.2.1) (1.15.0)\n",
      "Requirement already satisfied: dill>=0.3.3 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from multiprocess->paddlenlp==2.2.1) (0.3.3)\n",
      "Requirement already satisfied: scikit-learn>=0.21.3 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from seqeval->paddlenlp==2.2.1) (0.24.2)\n",
      "Requirement already satisfied: scipy>=0.19.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-learn>=0.21.3->seqeval->paddlenlp==2.2.1) (1.6.3)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-learn>=0.21.3->seqeval->paddlenlp==2.2.1) (2.1.0)\n",
      "Requirement already satisfied: joblib>=0.11 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-learn>=0.21.3->seqeval->paddlenlp==2.2.1) (0.14.1)\n"
     ]
    }
   ],
   "source": [
    "# clone PaddleOCR代码\n",
    "# ! git clone https://github.com/PaddlePaddle/PaddleOCR\n",
    "\n",
    "# 安装依赖包\n",
    "! pip install -U pip\n",
    "! pip install -r /home/aistudio/PaddleOCR/requirements.txt\n",
    "! pip install paddleocr\n",
    "\n",
    "# 安装依赖包\n",
    "! pip install yacs gnureadline paddlenlp==2.2.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Change the process working directory to the vqa directory inside the PaddleOCR checkout\n",
    "import os\n",
    "os.chdir('/home/aistudio/PaddleOCR/ppstructure/vqa')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2021-12-22 16:03:11--  https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar\n",
      "Resolving paddleocr.bj.bcebos.com (paddleocr.bj.bcebos.com)... 182.61.200.195, 182.61.200.229, 2409:8c04:1001:1002:0:ff:b001:368a\n",
      "Connecting to paddleocr.bj.bcebos.com (paddleocr.bj.bcebos.com)|182.61.200.195|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 1481431040 (1.4G) [application/x-tar]\n",
      "Saving to: ‘./inference/PP-Layout_v1.0_ser_pretrained.tar’\n",
      "\n",
      "PP-Layout_v1.0_ser_ 100%[===================>]   1.38G  45.5MB/s    in 31s     \n",
      "\n",
      "2021-12-22 16:03:42 (45.6 MB/s) - ‘./inference/PP-Layout_v1.0_ser_pretrained.tar’ saved [1481431040/1481431040]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# 下载模型\n",
    "! mkdir -p inference\n",
    "# 下载SER模型的检测模型并解压\n",
    "! wget -P ./inference/ https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && cd inference && tar xf PP-Layout_v1.0_ser_pretrained.tar && cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "W1226 20:10:47.258977   900 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1\n",
      "W1226 20:10:47.263010   900 device_context.cc:465] device: 0, cuDNN Version: 7.6.\n",
      "[2021/12/26 20:10:57] root WARNING: version PP-OCRv2 not support cls models, auto switch to version PP-OCR\n",
      "Namespace(benchmark=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, det=True, det_algorithm='DB', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/det/ch/ch_PP-OCRv2_det_infer', det_pse_box_thresh=0.85, det_pse_box_type='box', det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_polygon=False, det_sast_score_thresh=0.5, drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='fast', e2e_pgnet_polygon=True, e2e_pgnet_score_thresh=0.5, e2e_pgnet_valid_set='totaltext', enable_mkldnn=False, gpu_mem=500, help='==SUPPRESS==', image_dir=None, ir_optim=True, label_list=['0', '180'], lang='ch', layout_path_model='lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config', max_batch_size=10, max_text_length=25, min_subgraph_size=15, ocr_version='PP-OCRv2', output='./output/table', precision='fp32', process_id=0, rec=True, rec_algorithm='CRNN', rec_batch_num=6, rec_char_dict_path='/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleocr/ppocr/utils/ppocr_keys_v1.txt', rec_image_shape='3, 32, 320', rec_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/rec/ch/ch_PP-OCRv2_rec_infer', save_log_path='./log_output/', show_log=False, structure_version='STRUCTURE', table_char_dict_path=None, table_char_type='en', table_max_len=488, table_model_dir=None, total_process_num=1, type='ocr', use_angle_cls=False, use_dilation=False, use_gpu=True, use_mp=False, use_onnx=False, use_pdserving=False, use_space_char=True, use_tensorrt=False, 
vis_font_path='./doc/fonts/simfang.ttf', warmup=True)\n",
      "process: [0/1], save result to output/res_e2e/zh_val_42_ser.jpg\n",
      "Corrupt JPEG data: premature end of data segment\n",
      "\u001b[0m"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<matplotlib.image.AxesImage at 0x7f034a4dfa90>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA8wAAAU7CAYAAAD8byz1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzs3Xm8XVV9///XWnvvM9x7cwmZiIRJhjDKlBBQ5AdIHX4gg1r5aq1Wii19gNUi8gBk1IRBfFjRWieqpfhTrEVpK6DfSqxSnChEQBCRGDCQEEJCQu69Z9p7r/X7Y5+1c+7lBIIiuQnv5+Nxe889Z5+9197nWPLenzUY7z0iIiIiIiIiMp7d0g0QERERERERmYwUmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPp4yQOzMeZNxpiHjDFLjTHnv9THFxEREREREdkc5qVch9kYEwG/AV4PPA78L/BO7/2vXrJGiIiIiIiIiGyGl7rCvABY6r1f5r3vAN8ATn6J2yAiIiIiIiLyvOKX+HhzgMd6/n4cOLx3A2PMXwN/DTA4ODhvn332eelaJyIiIiIiItu8u+++e433fubzbfdSB+bn5b3/EvAlgPnz5/u77rprC7dIREREREREtiXGmN9tznYvdZfsFcDOPX/v1H1OREREREREZFJ5qQPz/wJ7GWNeaYypAO8A/vMlboOIiIiIiIjI83pJu2R77zNjzPuB/wtEwFe89w+8lG0QERERERER2Rwv+Rhm7/2twK0v9XFFREREREREXoiXuku2iIiIiIiIyFZBgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPpQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6iLd0A15q3vst3QQRERERERF5AYwxW+S4L7sK85a60CIiIiIiIrJ1edkFZlWYRUREREREZHO87AKziIiIiIiIyOZQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPpQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPpQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERF5mfLeA2CM2cItERERmZwUmEVEREShWUREpA8FZhERkZexEJRDtVlEREQ2ird0A0RERGTLMMYoKIuIiDwHVZhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPpQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWeRlxhizpZsgIiIiIrJVUGAWeRlRWBYRERER2XwKzCIiIiIiIiJ9KDCLvIw45/DeY4zB
e7+lmyMiIiIiMqkpMIu8jPQGZXXPFhERERF5bgrMIi8z1lpVl0VERERENoMCs8jLjMKyiIiIiMjmUWAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPpQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYREZkUjPcYk+NxGAfGOwwGn+dkGPCGzGQYY4rtjcE5h8lyvDfF897gjSE3YEyGcwZnPMYXr2H8s34MBpzDGDBQbA/kmSPzefG8NxhvcIDLPd51d2dSjDfkxnWPAx5D2nF4YzCAyR3G5Xjvu88ZDBkeg/MGj+nuyxXnbAzGGTxZsa13kHsIxybDeEdGcVwRERH544m3dANEREQAxpzln6/5HetHdsMlLcg8ucuACta0cdRI0pw88kSViNxl+DzF2xqWDaTeEvniP2u1akKeQSfPscbhvMP6CPA4A3hb3jGOog77H1ThgXvBp9CJc2IXEccROSNU4yFaLQ84nIEkNtTjDTSzKRz4qgq/uBuiyEMc4dMUE40yZ+fteerJMZrNKh3vqNgKkW2y7/51qhXYcWbCrd93+NzgzRjOR9SSKnmWMnWHJuufrAMJmcmJTcr+B9S4/5djtNKI2Fc59wLDQAWs93j9l1xEROSPRhVmERGZFOoGbPIUPl7D4YfVqSRrefNxwySJ503HDuK95y1/UaM+kBNH64gqa5k5o0piM17/+mGqHt74hhqHHlwj64zx5lMSKkRYE1HxMdaCtZbYRMTGYC3EkWPqdM+Dv1wDpkmS5MXrkSdNM+r1Z2iNdSDJyKKIqrFY0+aQQ7dj7u6WwWE45k9g1syYViOHKGG3Pduse/oRBiorwMfUbQXrcl7z2jqj6+Dh+0e4/5FnSLMW1hoqdohatU4nbeOco9Nq4TKPo6ie77ZrjSlTYLD+DIOVmCiGTg4dk+Fiv6U/NhERkW2aArOIiEwKPgNjaiRUwYCNLS0DLoUf3T5G3XoeuK9JHD0NQGws657ZQFLpYBOoxob/XpxiDGCrLH88o+1TWs7gLEU3Z2/wHjwQWY+jTd5x7LrzDOYdXoe4SWwM3ntia9l3/52oJBGkFp9C7lIq1ZynN8DaZ9qsWZtjclj5ZEpUM3Q6OXgLNsNiyZ0nigDb4p671rH3PrDvq6Ywfep2RAk4crafnbLf/nDg/nVmzMyYu/cMTJRhTU5sK0ydCkkMzifYyhpsMsJgBFUbYdRRTERE5I9KgVlERCaFLIIcS1Qdox5D4sFa2GmPOh2bQzzCo4+0OfDgORgyYl/DUmGHGUP4FKbv4PEmJYlh3nzLbx8YoUrOWX9jiJIGmXWAA5MTxTBthsXjaDXh6fWeJf/bJE9GSZ3DU4Te0fVw8GExSezwvkUUx6TNHNeBV+5UZdb0iJUrwLoIlxmSyDBlaBZ7770Xhx+7J1GtTcc5qCR0csMTKzKWPvwoDz74W6K4TWwca1fk/OZXHdoZrF0bcedPIHcJeZ5QGRxl5aoV/OqhB/F+DNeZQqc5XHQrh2IgtYiIiPzRKDCLiMikUHHF5Fqd9gDtFNrekeew8pGcaVOGOfzwaWSdDnvsBc7mZMZRS2D5irV4A089OcTwtDrewr0PrudPjt8en9f48j+vxbikqPwC+GJSrqee8EQUx/KRweR1fGMGlYrFxlAbgkcfTXngnjZHvDZhoGJoOUvbDZE5mDYL7lmSst0sILJUYsP++1uaTVj7JHzv1gZzd62RmIy8GZN1Bnhs5QZanRrp6I6krWHSLMGTkfrRohLtcpLEM3VmMSdZVE2oDszhmafm8Nqjd8PnAzjrwVBMQKa8LCIi8kelwCwiIpOCj2DXPXZgeLtBZs2EnXfckR1mwsEHRxx2OMzZBcjqjI7CrnPmsM++0zjwoCkk
psq06XDAobDgEMOMGWCaNdauBRM3ede7p2NNC2tyrPFY44ktGOvxvvgbB1mWgXHkrRzvR6glCZ08o92qct8v1pMMrqXqICY
      "text/plain": [
       "<Figure size 3456x1728 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 执行 SER 预测\n",
    "# https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.4/ppstructure/vqa/infer_ser_e2e.py\n",
    "\n",
    "! python infer_ser_e2e.py \\\n",
    "    --model_name_or_path \"./inference/PP-Layout_v1.0_ser_pretrained/\" \\\n",
    "    --max_seq_length 512 \\\n",
    "    --output_dir \"output/res_e2e/\" \\\n",
    "    --infer_imgs \"images/input/zh_val_42.jpg\"\n",
    "    \n",
    "import cv2\n",
    "from matplotlib import pyplot as plt\n",
    "# Required so that matplotlib.pyplot figures are rendered inside the notebook\n",
    "%matplotlib inline\n",
    "\n",
    "img = cv2.imread('output/res_e2e/zh_val_42_ser.jpg')\n",
    "plt.figure(figsize=(48,24))\n",
    "# cv2.imread returns channels in BGR order while matplotlib expects RGB,\n",
    "# so convert before plotting to display the colors correctly\n",
    "plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## 2. 原理详解\n",
    "\n",
    "PaddleOCR中DOC-VQA系列算法目前基于[LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf)论文实现，提供了SER和RE两种任务\n",
    "\n",
    "LayoutXLM是LayoutLMV2的多语言版本，LayoutLMV2原理图如下：\n",
    "\n",
    "<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/82762e847487489ea92ead44679bbfbed5e5d0acbcf94a3081524ce50d29f513\" width=\"1000\" ></center>\n",
    "<br><center>图1 LayoutLMV2算法</center>\n",
    "\n",
    "相对于NLP中的Bert，LayoutXLM在模型的输入端新增了Image和图像内文字的Layout信息，LayoutXLM已经在PaddleNLP中实现，因此这里从模型前向的角度介绍数据和网络。\n",
    "\n",
    "### 2.1 输入数据处理\n",
    "\n",
    "首先对图像进行ocr识别或pdf解析，获取text及bbox信息，在此基础上构建模型的三个输入：\n",
    "\n",
    "1. Text Embedding\n",
    "\n",
    "\t首先对OCR识别的文本使用 WordPiece 进行切分，之后添加 [CLS] 和 [SEP] 标记，并用 [PAD] 补齐长度得到文本输入序列如下：\n",
    "  \n",
    "   $$S=\\{[CLS], w_1, w_2, \\cdots , [SEP], [PAD], [PAD], \\cdots \\}, |S|=L$$\n",
    "\t\n",
    "    再将词向量、一维位置向量、分段向量相加得到文本向量，公式如下：\n",
    "    \n",
    "   $$t_i=TokEmb(w_i)+PosEmb1D(i)+SegEmb(s_i), 0 \\leq i<L$$ \n",
    "    \n",
    "    一维位置向量：词的索引\n",
    "    \n",
    "    分段向量：A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "分词结果： ['▁我', '的中国', '心']\n",
      "转换为索引结果： {'input_ids': [0, 13129, 84072, 1801, 2], 'token_type_ids': [0, 0, 0, 0, 0]}\n"
     ]
    }
   ],
   "source": [
    "# Text Embedding 演示\n",
    "\n",
    "from paddlenlp.transformers import LayoutXLMTokenizer\n",
    "\n",
    "tokenizer = LayoutXLMTokenizer.from_pretrained('inference/PP-Layout_v1.0_ser_pretrained')\n",
    "# 分词\n",
    "print('分词结果：', tokenizer.tokenize('我的中国心'))\n",
    "# 转换为索引\n",
    "print('转换为索引结果：', tokenizer.encode('我的中国心'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "2. Image Embedding\n",
    "\n",
    "\t我们使用 ResNeXt-FPN 网络作为图像编码器，首先抽取原始文档图像的特征图，再将其平均池化为固定尺寸（B * 256 * 7 * 7），接着按行展开平均池化后的特征图（B * 256 * 49），之后经过线性投影（B * 49 * 256），就可以得到图像对应的特征序列。和文本向量的组成对应，图像向量也补充了一维相对位置和分段信息。最后将特征向量、一维位置向量、分段向量相加就可以得到最终的图像向量，如下所示：\n",
    "    \n",
    " \t$$v_i=Proj(VisTokEmb(I)_i)+PosEmb1D(i)+SegEmb([C]), 0 \\leq i<WH$$\n",
    "    \n",
    "    分段向量：C\n",
    "3. Layout Embedding\n",
    "\n",
    "\t对应于每个词或图像区域在页面中覆盖的坐标范围，使用平行于坐标轴的边界框（bounding box）表示布局信息，每个边界框用4个边界坐标值、宽、高来表示。最终的布局向量由6个特征对应的向量拼接得到：\n",
    "    \n",
    "   $$l_i=Concat(PosEmb2D_x(x_0, x_1, w), PosEmb2D_y(y_0, y_1, h)), 0 \\leq i<WH+L$$\n",
    "   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "下面演示预测过程中，从输入一张图像构建网络输入的过程，整个过程主要包含下面几个步骤\n",
    "\n",
    "1. 对图像进行OCR识别\n",
    "2. 对图像进行预处理，包括缩放到指定大小和归一化\n",
    "3. 对识别到的文本进行分词和转index\n",
    "4. 对文本框进行归一化，使其值在0-1000之间\n",
    "5. 对3，4处理后的结果进行pad，便于组batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2021/12/26 20:11:19] root WARNING: version PP-OCRv2 not support cls models, auto switch to version PP-OCR\n",
      "Namespace(benchmark=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, det=True, det_algorithm='DB', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/det/ch/ch_PP-OCRv2_det_infer', det_pse_box_thresh=0.85, det_pse_box_type='box', det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_polygon=False, det_sast_score_thresh=0.5, drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='fast', e2e_pgnet_polygon=True, e2e_pgnet_score_thresh=0.5, e2e_pgnet_valid_set='totaltext', enable_mkldnn=False, gpu_mem=500, help='==SUPPRESS==', image_dir=None, ir_optim=True, label_list=['0', '180'], lang='ch', layout_path_model='lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config', max_batch_size=10, max_text_length=25, min_subgraph_size=15, ocr_version='PP-OCRv2', output='./output/table', precision='fp32', process_id=0, rec=True, rec_algorithm='CRNN', rec_batch_num=6, rec_char_dict_path='/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleocr/ppocr/utils/ppocr_keys_v1.txt', rec_image_shape='3, 32, 320', rec_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/rec/ch/ch_PP-OCRv2_rec_infer', save_log_path='./log_output/', show_log=False, structure_version='STRUCTURE', table_char_dict_path=None, table_char_type='en', table_max_len=488, table_model_dir=None, total_process_num=1, type='ocr', use_angle_cls=False, use_dilation=False, use_gpu=True, use_mp=False, use_onnx=False, use_pdserving=False, use_space_char=True, use_tensorrt=False, 
vis_font_path='./doc/fonts/simfang.ttf', warmup=True)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Corrupt JPEG data: premature end of data segment\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['input_ids', 'token_type_ids', 'bbox', 'attention_mask', 'image', 'segment_offset_id'])\n",
      "[2, 3, 224, 224]\n"
     ]
    }
   ],
   "source": [
    "# 预测输入构建\n",
    "# https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/vqa/vqa_utils.py\n",
    "\n",
    "import cv2\n",
    "import numpy as np\n",
    "import paddle\n",
    "from copy import deepcopy\n",
    "from paddleocr import PaddleOCR\n",
    "from paddlenlp.transformers import LayoutXLMTokenizer\n",
    "\n",
    "from infer_ser_e2e import trans_poly_to_bbox,pad_sentences,split_page\n",
    "\n",
    "def parse_ocr_info_for_ser(ocr_result):\n",
    "    \"\"\"Turn raw OCR results into a list of dicts.\n",
    "\n",
    "    For every item, ``item[1][0]`` is stored as the text and ``item[0]`` as the\n",
    "    polygon; the polygon is also passed through ``trans_poly_to_bbox`` to get\n",
    "    the bounding rectangle stored under ``bbox``.\n",
    "    \"\"\"\n",
    "    return [{\n",
    "        \"text\": item[1][0],\n",
    "        \"bbox\": trans_poly_to_bbox(item[0]),\n",
    "        \"poly\": item[0],\n",
    "    } for item in ocr_result]\n",
    "\n",
    "def preprocess(\n",
    "        tokenizer,\n",
    "        ori_img,\n",
    "        ocr_info,\n",
    "        img_size=(224, 224),\n",
    "        pad_token_label_id=-100,\n",
    "        max_seq_len=512,\n",
    "        add_special_ids=False,\n",
    "        return_attention_mask=True, ):\n",
    "    \"\"\"Build the network inputs for one image from its OCR results.\n",
    "\n",
    "    Args:\n",
    "        tokenizer: object providing ``encode``; also forwarded to ``pad_sentences``.\n",
    "        ori_img: image array; ``shape[0]`` / ``shape[1]`` are used as height / width.\n",
    "        ocr_info: list of dicts with ``text`` and ``bbox`` (x1, y1, x2, y2) entries.\n",
    "            It is deep-copied first, so the caller's list is left untouched.\n",
    "        img_size: target size handed to ``cv2.resize``.\n",
    "        pad_token_label_id: not used by this function; kept for interface compatibility.\n",
    "        max_seq_len: sequence length forwarded to ``pad_sentences``.\n",
    "        add_special_ids: when False, the first and last token of every encoded\n",
    "            text are dropped.\n",
    "        return_attention_mask: forwarded to ``pad_sentences``.\n",
    "\n",
    "    Returns:\n",
    "        dict with the keys ``input_ids``, ``token_type_ids``, ``bbox``,\n",
    "        ``attention_mask``, ``image`` and ``segment_offset_id``.\n",
    "    \"\"\"\n",
    "    ocr_info = deepcopy(ocr_info)\n",
    "    height = ori_img.shape[0]\n",
    "    width = ori_img.shape[1]\n",
    "\n",
    "    # Resize the image to the requested size and move the channel axis first\n",
    "    img = cv2.resize(ori_img, img_size).transpose([2, 0, 1]).astype(np.float32)\n",
    "\n",
    "    segment_offset_id = []  # end position of every text inside input_ids\n",
    "    bbox_list = []  # boxes normalized to 0-1000, repeated once per token\n",
    "    input_ids_list = []  # vocabulary indices of the tokens of every text\n",
    "    token_type_ids_list = []  # token type ids of the tokens of every text\n",
    "\n",
    "    for info in ocr_info:\n",
    "        # Normalize the box (x1, y1, x2, y2) to the 0-1000 range\n",
    "        bbox = info[\"bbox\"]\n",
    "        bbox[0] = int(bbox[0] * 1000.0 / width)\n",
    "        bbox[2] = int(bbox[2] * 1000.0 / width)\n",
    "        bbox[1] = int(bbox[1] * 1000.0 / height)\n",
    "        bbox[3] = int(bbox[3] * 1000.0 / height)\n",
    "\n",
    "        # Tokenize the text and convert the tokens to vocabulary indices\n",
    "        text = info[\"text\"]\n",
    "        encode_res = tokenizer.encode(\n",
    "            text, pad_to_max_seq_len=False, return_attention_mask=True)\n",
    "\n",
    "        # Optionally strip the special tokens at both ends\n",
    "        if not add_special_ids:\n",
    "            # TODO: use tok.all_special_ids to remove\n",
    "            encode_res[\"input_ids\"] = encode_res[\"input_ids\"][1:-1]\n",
    "            encode_res[\"token_type_ids\"] = encode_res[\"token_type_ids\"][1:-1]\n",
    "            encode_res[\"attention_mask\"] = encode_res[\"attention_mask\"][1:-1]\n",
    "\n",
    "        input_ids_list.extend(encode_res[\"input_ids\"])\n",
    "        token_type_ids_list.extend(encode_res[\"token_type_ids\"])\n",
    "        bbox_list.extend([bbox] * len(encode_res[\"input_ids\"]))\n",
    "        segment_offset_id.append(len(input_ids_list))\n",
    "\n",
    "    encoded_inputs = {\n",
    "        \"input_ids\": input_ids_list,\n",
    "        \"token_type_ids\": token_type_ids_list,\n",
    "        \"bbox\": bbox_list,\n",
    "        \"attention_mask\": [1] * len(input_ids_list),\n",
    "    }\n",
    "    # Pad every field to the target length (missing positions are padded)\n",
    "    encoded_inputs = pad_sentences(\n",
    "        tokenizer,\n",
    "        encoded_inputs,\n",
    "        max_seq_len=max_seq_len,\n",
    "        return_attention_mask=return_attention_mask)\n",
    "\n",
    "    # When input_ids is longer than 512 the page is split into 2 batches.\n",
    "    # Use the returned value explicitly instead of relying on in-place mutation.\n",
    "    encoded_inputs = split_page(encoded_inputs)\n",
    "\n",
    "    fake_bs = encoded_inputs[\"input_ids\"].shape[0]\n",
    "\n",
    "    # Repeat the single image once per batch entry\n",
    "    encoded_inputs[\"image\"] = paddle.to_tensor(img).unsqueeze(0).expand(\n",
    "        [fake_bs] + list(img.shape))\n",
    "\n",
    "    encoded_inputs[\"segment_offset_id\"] = segment_offset_id\n",
    "\n",
    "    return encoded_inputs\n",
    "\n",
    "img = cv2.imread('images/input/zh_val_42.jpg')\n",
    "\n",
    "ocr_engine = PaddleOCR(use_angle_cls=False,show_log=False)\n",
    "# 执行ocr识别\n",
    "ocr_result = ocr_engine.ocr(img, cls=False)\n",
    "# ocr结果转字典形式，文本框转换为外接矩形\n",
    "ocr_info = parse_ocr_info_for_ser(ocr_result)\n",
    "\n",
    "\n",
    "tokenizer = LayoutXLMTokenizer.from_pretrained('inference/PP-Layout_v1.0_ser_pretrained')\n",
    "# 对图像进行resize，\n",
    "# 对文本进行分词，转换为词典索引等操作，\n",
    "# 对box进行归一化\n",
    "max_seq_length = 512\n",
    "inputs = preprocess(tokenizer=tokenizer,ori_img=img,ocr_info=ocr_info,max_seq_len=max_seq_length, img_size=(224,224))\n",
    "\n",
    "print(inputs.keys())\n",
    "print(inputs['image'].shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "经过处理后的数据为一个字典，包含下面几个字段:\n",
    "<center>\n",
    "  \n",
    "| 字段 | 含义 |\n",
    "|---|---|\n",
    "|image| resize为224*224的图像 |\n",
    "|bbox| 归一化到 0-1000 的box |\n",
    "|input_ids| 文本经过分词之后的文本段在词表里的索引 |\n",
    "|token_type_ids| 文本段的类别信息|\n",
    "|attention_mask| 对文本段进行mask的掩码，特殊字符对应位置标记为0，文本段对应位置标记为1 |\n",
    "|segment_offset_id| 记录了每个文本在input_ids里的结束位置|\n",
    "  \n",
    "</center>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "### 2.2 SER网络\n",
    "\n",
    "SER: 语义实体识别（Semantic Entity Recognition），可以完成对图像中的文本识别与分类。 \n",
    "SER网络在LayoutXLMModel的输出上加了一个全连接的分类头，其网络代码如下："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/layoutxlm/modeling.py#L846\n",
    "\n",
    "from paddlenlp.transformers import LayoutXLMPretrainedModel\n",
    "from paddle import nn\n",
    "class LayoutXLMForTokenClassification(LayoutXLMPretrainedModel):\n",
    "    \"\"\"LayoutXLM backbone followed by a linear token-classification head.\"\"\"\n",
    "\n",
    "    def __init__(self, layoutxlm, num_classes=2, dropout=None):\n",
    "        \"\"\"Create the head on top of ``layoutxlm``.\n",
    "\n",
    "        Args:\n",
    "            layoutxlm: a backbone instance, or a dict of keyword arguments\n",
    "                used to build a ``LayoutXLMModel``.\n",
    "            num_classes: number of output classes of the linear head.\n",
    "            dropout: dropout probability; when None the backbone's\n",
    "                ``hidden_dropout_prob`` config value is used.\n",
    "        \"\"\"\n",
    "        super(LayoutXLMForTokenClassification, self).__init__()\n",
    "        self.num_classes = num_classes\n",
    "        if isinstance(layoutxlm, dict):\n",
    "            # LayoutXLMModel is not imported by this cell; import it here so\n",
    "            # that the dict branch does not fail with a NameError\n",
    "            from paddlenlp.transformers import LayoutXLMModel\n",
    "            self.layoutxlm = LayoutXLMModel(**layoutxlm)\n",
    "        else:\n",
    "            self.layoutxlm = layoutxlm\n",
    "        self.dropout = nn.Dropout(dropout if dropout is not None else self.layoutxlm.config[\"hidden_dropout_prob\"])\n",
    "        self.classifier = nn.Linear(self.layoutxlm.config[\"hidden_size\"],num_classes)\n",
    "        self.classifier.apply(self.init_weights)\n",
    "\n",
    "    def get_input_embeddings(self):\n",
    "        \"\"\"Return the word embedding layer of the backbone.\"\"\"\n",
    "        return self.layoutxlm.embeddings.word_embeddings\n",
    "\n",
    "    def forward(self, input_ids=None, bbox=None, image=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None):\n",
    "        \"\"\"Run backbone and head.\n",
    "\n",
    "        Returns:\n",
    "            ``(logits, )``, or ``(loss, logits)`` when ``labels`` is given.\n",
    "        \"\"\"\n",
    "        # Backbone forward\n",
    "        outputs = self.layoutxlm(input_ids=input_ids, bbox=bbox, image=image, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask)\n",
    "        seq_length = input_ids.shape[1]\n",
    "        # Head forward: the first seq_length positions are the text part,\n",
    "        # the remaining positions are the image part\n",
    "        sequence_output, image_output = outputs[0][:, :seq_length], outputs[0][:, seq_length:]\n",
    "        sequence_output = self.dropout(sequence_output)\n",
    "        logits = self.classifier(sequence_output)\n",
    "\n",
    "        outputs = (logits, )\n",
    "\n",
    "        # Compute the loss\n",
    "        if labels is not None:\n",
    "            loss_fct = nn.CrossEntropyLoss()\n",
    "\n",
    "            if attention_mask is not None:\n",
    "                # Only positions whose attention mask is 1 contribute to the loss\n",
    "                active_loss = attention_mask.reshape([-1, ]) == 1\n",
    "                active_logits = logits.reshape([-1, self.num_classes])[active_loss]\n",
    "                active_labels = labels.reshape([-1, ])[active_loss]\n",
    "                loss = loss_fct(active_logits, active_labels)\n",
    "            else:\n",
    "                loss = loss_fct(logits.reshape([-1, self.num_classes]),labels.reshape([-1, ]))\n",
    "            outputs = (loss, ) + outputs\n",
    "        return outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2, 512, 7]\n"
     ]
    }
   ],
   "source": [
    "# 初始化网络\n",
    "net = LayoutXLMForTokenClassification.from_pretrained('inference/PP-Layout_v1.0_ser_pretrained')\n",
    "net.eval()\n",
    "# 执行网络前向\n",
    "outputs = net(input_ids=inputs[\"input_ids\"],\n",
    "            bbox=inputs[\"bbox\"],\n",
    "            image=inputs[\"image\"],\n",
    "            token_type_ids=inputs[\"token_type_ids\"],\n",
    "            attention_mask=inputs[\"attention_mask\"])\n",
    "print(outputs[0].shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "### 2.3 后处理\n",
    "\n",
    "后处理主要完成将模型输出的文本段的预测结果对应到文本上，并且将结果和ocr的结果进行结合，主要包含以下几个步骤：\n",
    "\n",
    "1. 每一个文本，统计该文本下所有文本段预测的label\n",
    "2. 选取所有文本段预测最多的label作为该文本的label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "label2id_map: {'O': 0, 'B-QUESTION': 1, 'I-QUESTION': 2, 'B-ANSWER': 3, 'I-ANSWER': 4, 'B-HEADER': 5, 'I-HEADER': 6}\n",
      "label2id_map_for_draw: {'O': 0, 'B-QUESTION': 1, 'I-QUESTION': 1, 'B-ANSWER': 3, 'I-ANSWER': 3, 'B-HEADER': 5, 'I-HEADER': 5}\n",
      "id2label_map: {0: 'O', 1: 'QUESTION', 3: 'ANSWER', 5: 'HEADER'}\n",
      "[{'text': '个人信息登记表', 'bbox': [1026.0, 292.0, 1495.0, 377.0], 'poly': [[1027.0, 292.0], [1495.0, 300.0], [1494.0, 377.0], [1026.0, 369.0]], 'pred_id': 5, 'pred': 'HEADER'}, {'text': '申报学院（部门）：', 'bbox': [207.0, 424.0, 587.0, 475.0], 'poly': [[207.0, 424.0], [587.0, 424.0], [587.0, 475.0], [207.0, 475.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '出生', 'bbox': [1144.0, 526.0, 1218.0, 566.0], 'poly': [[1144.0, 526.0], [1218.0, 526.0], [1218.0, 566.0], [1144.0, 566.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '政治', 'bbox': [1616.0, 530.0, 1709.0, 570.0], 'poly': [[1616.0, 530.0], [1709.0, 530.0], [1709.0, 570.0], [1616.0, 570.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '1997年12月17日「面貌', 'bbox': [1298.0, 558.0, 1713.0, 644.0], 'poly': [[1301.0, 558.0], [1713.0, 571.0], [1711.0, 644.0], [1298.0, 631.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '岳欣欣', 'bbox': [491.0, 559.0, 653.0, 614.0], 'poly': [[491.0, 559.0], [653.0, 559.0], [653.0, 614.0], [491.0, 614.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '性别', 'bbox': [805.0, 559.0, 908.0, 618.0], 'poly': [[805.0, 559.0], [908.0, 559.0], [908.0, 618.0], [805.0, 618.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '群众', 'bbox': [1801.0, 552.0, 1886.0, 614.0], 'poly': [[1801.0, 552.0], [1886.0, 552.0], [1886.0, 614.0], [1801.0, 614.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '女', 'bbox': [1008.0, 563.0, 1070.0, 610.0], 'poly': [[1008.0, 563.0], [1070.0, 563.0], [1070.0, 610.0], [1008.0, 610.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '(拼音)', 'bbox': [207.0, 577.0, 354.0, 629.0], 'poly': [[207.0, 577.0], [354.0, 577.0], [354.0, 629.0], [207.0, 629.0]], 'pred_id': 0, 'pred': 'O'}, {'text': '日期', 'bbox': [1126.0, 577.0, 1222.0, 632.0], 'poly': [[1126.0, 577.0], [1222.0, 577.0], [1222.0, 632.0], [1126.0, 632.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '婚育「', 'bbox': [1120.0, 634.0, 1272.0, 714.0], 'poly': [[1127.0, 634.0], [1272.0, 649.0], [1265.0, 714.0], [1120.0, 699.0]], 'pred_id': 1, 'pred': 
'QUESTION'}, {'text': '生源地/培养形式', 'bbox': [1506.0, 647.0, 1790.0, 698.0], 'poly': [[1506.0, 647.0], [1790.0, 647.0], [1790.0, 698.0], [1506.0, 698.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '未婚', 'bbox': [1270.0, 654.0, 1369.0, 709.0], 'poly': [[1270.0, 654.0], [1369.0, 654.0], [1369.0, 709.0], [1270.0, 709.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '寸报名照', 'bbox': [2140.0, 650.0, 2273.0, 702.0], 'poly': [[2140.0, 650.0], [2273.0, 650.0], [2273.0, 702.0], [2140.0, 702.0]], 'pred_id': 0, 'pred': 'O'}, {'text': '中国', 'bbox': [443.0, 661.0, 528.0, 720.0], 'poly': [[443.0, 661.0], [528.0, 661.0], [528.0, 720.0], [443.0, 720.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '国籍', 'bbox': [244.0, 672.0, 336.0, 731.0], 'poly': [[244.0, 672.0], [336.0, 672.0], [336.0, 731.0], [244.0, 731.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '民族', 'bbox': [812.0, 672.0, 904.0, 734.0], 'poly': [[812.0, 672.0], [904.0, 672.0], [904.0, 734.0], [812.0, 734.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '辽宁省西丰县', 'bbox': [1841.0, 668.0, 2064.0, 731.0], 'poly': [[1843.0, 668.0], [2064.0, 677.0], [2062.0, 731.0], [1841.0, 723.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '状况', 'bbox': [1137.0, 713.0, 1218.0, 756.0], 'poly': [[1137.0, 713.0], [1218.0, 713.0], [1218.0, 756.0], [1137.0, 756.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '(应届毕业生填写）', 'bbox': [1528.0, 720.0, 1782.0, 760.0], 'poly': [[1528.0, 720.0], [1782.0, 720.0], [1782.0, 760.0], [1528.0, 760.0]], 'pred_id': 0, 'pred': 'O'}, {'text': '粘贴处', 'bbox': [2140.0, 716.0, 2251.0, 767.0], 'poly': [[2140.0, 716.0], [2251.0, 716.0], [2251.0, 767.0], [2140.0, 767.0]], 'pred_id': 0, 'pred': 'O'}, {'text': '现工作（学习)', 'bbox': [192.0, 767.0, 484.0, 822.0], 'poly': [[192.0, 767.0], [484.0, 771.0], [483.0, 822.0], [192.0, 818.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '大连海事大学', 'bbox': [542.0, 781.0, 794.0, 841.0], 'poly': [[544.0, 781.0], [
     ]
    }
   ],
   "source": [
    "# https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/vqa/vqa_utils.py\n",
    "\n",
    "import paddle\n",
    "import numpy as np\n",
    "\n",
    "from infer_ser_e2e import get_bio_label_maps\n",
    "\n",
    "label2id_map, id2label_map = get_bio_label_maps('labels/labels_ser.txt')\n",
    "\n",
    "def postprocess(attention_mask, preds, id2label_map):\n",
    "    \"\"\"Convert network outputs into per-batch lists of labels.\n",
    "\n",
    "    The arg-max over axis 2 of ``preds`` is looked up in ``id2label_map``;\n",
    "    only positions whose ``attention_mask`` value equals 1 are kept.\n",
    "    \"\"\"\n",
    "    if isinstance(preds, paddle.Tensor):\n",
    "        preds = preds.numpy()\n",
    "    label_ids = np.argmax(preds, axis=2)\n",
    "    batch_size, seq_len = label_ids.shape[0], label_ids.shape[1]\n",
    "\n",
    "    # One list per batch entry, so the batch structure is preserved\n",
    "    preds_list = []\n",
    "    for row in range(batch_size):\n",
    "        preds_list.append([\n",
    "            id2label_map[label_ids[row][col]] for col in range(seq_len)\n",
    "            if attention_mask[row][col] == 1\n",
    "        ])\n",
    "\n",
    "    return preds_list\n",
    "\n",
    "def merge_preds_list_with_ocr_info(ocr_info, segment_offset_id, preds_list,\n",
    "                                   label2id_map_for_draw):\n",
    "    \"\"\"Attach one predicted label to every OCR text.\n",
    "\n",
    "    Args:\n",
    "        ocr_info: list of dicts, one per text; modified in place.\n",
    "        segment_offset_id: end position of every text in the flattened predictions.\n",
    "        preds_list: per-batch lists of predicted labels.\n",
    "        label2id_map_for_draw: mapping from label to the id used for voting.\n",
    "\n",
    "    Returns:\n",
    "        ``ocr_info`` with ``pred_id`` and ``pred`` added to every entry.\n",
    "    \"\"\"\n",
    "    # Flatten the per-batch prediction lists into one list\n",
    "    preds = [p for pred in preds_list for p in pred]\n",
    "\n",
    "    # Invert label2id into id2label, dropping the B- / I- prefixes\n",
    "    id2label_map = dict()\n",
    "    for key in label2id_map_for_draw:\n",
    "        val = label2id_map_for_draw[key]\n",
    "        if key.startswith(\"B-\") or key.startswith(\"I-\"):\n",
    "            id2label_map[val] = key[2:]\n",
    "        else:\n",
    "            # Labels without a prefix (such as \"O\") are kept unchanged\n",
    "            id2label_map[val] = key\n",
    "    print(\"id2label_map:\",id2label_map)\n",
    "\n",
    "    # For every text, vote over the labels predicted for its tokens\n",
    "    for idx in range(len(segment_offset_id)):\n",
    "        start_id = 0 if idx == 0 else segment_offset_id[idx - 1]\n",
    "        end_id = segment_offset_id[idx]\n",
    "        # Ids of all predictions that fall inside the range of this text\n",
    "        curr_pred = [label2id_map_for_draw[p] for p in preds[start_id:end_id]]\n",
    "\n",
    "        if len(curr_pred) <= 0:\n",
    "            pred_id = 0\n",
    "        else:\n",
    "            # Count the ids and take the most frequent one\n",
    "            counts = np.bincount(curr_pred)\n",
    "            pred_id = np.argmax(counts)\n",
    "        ocr_info[idx][\"pred_id\"] = int(pred_id)\n",
    "        ocr_info[idx][\"pred\"] = id2label_map[int(pred_id)]\n",
    "    return ocr_info\n",
    "\n",
    "preds = postprocess(inputs[\"attention_mask\"], outputs[0], id2label_map)\n",
    "\n",
    "# Map every label starting with \"I-\" to the id of its \"B-\" counterpart\n",
    "label2id_map_for_draw = {\n",
    "    label: label2id_map[\"B\" + label[1:]] if label.startswith(\"I-\") else label_id\n",
    "    for label, label_id in label2id_map.items()\n",
    "}\n",
    "print(\"label2id_map:\",label2id_map)\n",
    "print(\"label2id_map_for_draw:\",label2id_map_for_draw)\n",
    "# Merge the predictions with the OCR information\n",
    "ocr_info_with_ser = merge_preds_list_with_ocr_info(ocr_info, inputs[\"segment_offset_id\"], preds, label2id_map_for_draw)\n",
    "print(ocr_info_with_ser)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## 3. 训练\n",
    "\n",
    "\n",
    "本节以XFUN中文数据集为例介绍如何完成SER模型的训练、评估与测试。\n",
    "\n",
    "### 3.1 数据准备\n",
    "\n",
    "这里使用[XFUN](https://github.com/doc-analysis/XFUND)数据集作为实验数据集。 \n",
    "XFUN数据集是微软提出的一个用于KIE任务的多语言数据集，共包含七个数据集，每个数据集包含149张训练图像和50张验证图像：\n",
    "\n",
    "* ZH(中文)\n",
    "* JA(日语) \n",
    "* ES(西班牙)\n",
    "* FR(法语) \n",
    "* IT(意大利) \n",
    "* DE(德语)  \n",
    "* PT(葡萄牙)  \n",
    "\n",
    "本次实验选取中文数据集作为我们的演示数据集。法语数据集作为实践课程的数据集，数据集样例图如下图所示\n",
    "\n",
    "\n",
    "<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/0f84137778cd4ab6899c64109d452290e9c678ccf01744978bc9c0647adbba45\" width=\"1000\" ></center>\n",
    "<br><center>图2 数据集样例，左中文，右法语</center>\n",
    "\n",
    "\n",
    "可以运行如下指令完成中文数据集下载和解压，或者从 [https://github.com/doc-analysis/XFUND](https://github.com/doc-analysis/XFUND) 中自行下载。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File ‘XFUND.tar’ already there; not retrieving.\r\n",
      "\r\n"
     ]
    }
   ],
   "source": [
    "# Download and unpack the Chinese XFUND subset.\n",
    "# -nc (no-clobber): skip the download when XFUND.tar already exists, so re-running the cell\n",
    "# does not fetch a duplicate copy (XFUND.tar.1)\n",
    "! wget -nc https://paddleocr.bj.bcebos.com/dataset/XFUND.tar\n",
    "! tar -xf XFUND.tar\n",
    "\n",
    "# The other XFUN language subsets can be converted with the script below\n",
    "# https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.4/ppstructure/vqa/helper/trans_xfun_data.py"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "运行上述指令后在 /home/aistudio/PaddleOCR/ppstructure/vqa/XFUND 目录下有2个文件夹，目录结构如下所示：\n",
    "\n",
    "```bash\n",
    "/home/aistudio/PaddleOCR/ppstructure/vqa/XFUND\n",
    "  ├─ zh_train/        \t\t \t训练集\n",
    "  │   ├── image/\t\t\t\t图片存放文件夹\n",
    "  │   └── xfun_normalize_train.json \t标注信息\n",
    "  └─ zh_val/        \t\t \t验证集\n",
    "      ├── image/\t\t\t图片存放文件夹\n",
    "      └── xfun_normalize_val.json \t标注信息\n",
    "\n",
    "```\n",
    "\n",
    "该数据集的标注格式为\n",
    "\n",
    "```bash\n",
    "{\n",
    "    \"height\": 3508, # 图像高度\n",
    "    \"width\": 2480,  # 图像宽度\n",
    "    \"ocr_info\": [\n",
    "        {\n",
    "            \"text\": \"邮政地址:\",  # 单个文本内容\n",
    "            \"label\": \"question\", # 文本所属类别\n",
    "            \"bbox\": [261, 802, 483, 859], # 单个文本框\n",
    "            \"id\": 54,  # 文本索引\n",
    "            \"linking\": [[54, 60]], # 当前文本和其他文本的关系 [question, answer]\n",
    "            \"words\": []\n",
    "        },\n",
    "        {\n",
    "            \"text\": \"湖南省怀化市市辖区\",\n",
    "            \"label\": \"answer\",\n",
    "            \"bbox\": [487, 810, 862, 859],\n",
    "            \"id\": 60,\n",
    "            \"linking\": [[54, 60]],\n",
    "            \"words\": []\n",
    "        }\n",
    "    ]\n",
    "}\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "### 3.2 损失函数定义\n",
    "\n",
    "因为是多分类任务，损失函数使用 CrossEntropyLoss。\n",
    "\n",
    "### 3.3 模型训练\n",
    "\n",
    "完成数据处理和损失函数定义后即可开始训练模型了。\n",
    "\n",
    "具体训练命令如下："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2021/12/26 20:12:07] root INFO: -----------  Configuration Arguments -----------\n",
      "[2021/12/26 20:12:07] root INFO: adam_epsilon: 1e-08\n",
      "[2021/12/26 20:12:07] root INFO: det_model_dir: None\n",
      "[2021/12/26 20:12:07] root INFO: eval_data_dir: XFUND/zh_val/image\n",
      "[2021/12/26 20:12:07] root INFO: eval_label_path: XFUND/zh_val/xfun_normalize_val.json\n",
      "[2021/12/26 20:12:07] root INFO: eval_steps: 10\n",
      "[2021/12/26 20:12:07] root INFO: evaluate_during_training: True\n",
      "[2021/12/26 20:12:07] root INFO: infer_imgs: None\n",
      "[2021/12/26 20:12:07] root INFO: label_map_path: ./labels/labels_ser.txt\n",
      "[2021/12/26 20:12:07] root INFO: learning_rate: 5e-05\n",
      "[2021/12/26 20:12:07] root INFO: max_grad_norm: 1.0\n",
      "[2021/12/26 20:12:07] root INFO: max_seq_length: 512\n",
      "[2021/12/26 20:12:07] root INFO: model_name_or_path: layoutxlm-base-uncased\n",
      "[2021/12/26 20:12:07] root INFO: num_train_epochs: 200\n",
      "[2021/12/26 20:12:07] root INFO: num_workers: 0\n",
      "[2021/12/26 20:12:07] root INFO: ocr_json_path: None\n",
      "[2021/12/26 20:12:07] root INFO: output_dir: ./output/ser/\n",
      "[2021/12/26 20:12:07] root INFO: per_gpu_eval_batch_size: 8\n",
      "[2021/12/26 20:12:07] root INFO: per_gpu_train_batch_size: 8\n",
      "[2021/12/26 20:12:07] root INFO: re_model_name_or_path: None\n",
      "[2021/12/26 20:12:07] root INFO: rec_model_dir: None\n",
      "[2021/12/26 20:12:07] root INFO: resume: False\n",
      "[2021/12/26 20:12:07] root INFO: seed: 2048\n",
      "[2021/12/26 20:12:07] root INFO: ser_model_type: LayoutXLM\n",
      "[2021/12/26 20:12:07] root INFO: train_data_dir: XFUND/zh_train/image\n",
      "[2021/12/26 20:12:07] root INFO: train_label_path: XFUND/zh_train/xfun_normalize_train.json\n",
      "[2021/12/26 20:12:07] root INFO: warmup_steps: 50\n",
      "[2021/12/26 20:12:07] root INFO: weight_decay: 0.0\n",
      "[2021/12/26 20:12:07] root INFO: ------------------------------------------------\n",
      "[2021-12-26 20:12:07,259] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/layoutxlm-base-uncased/sentencepiece.bpe.model\n",
      "[2021-12-26 20:12:07,928] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/layoutxlm-base-uncased/model_state.pdparams\n",
      "W1226 20:12:07.929606  1085 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1\n",
      "W1226 20:12:07.933472  1085 device_context.cc:465] device: 0, cuDNN Version: 7.6.\n",
      "[2021/12/26 20:12:18] root INFO: train from scratch\n",
      "[2021/12/26 20:12:18] root INFO: ***** Running training *****\n",
      "[2021/12/26 20:12:18] root INFO:   Num examples = 149\n",
      "[2021/12/26 20:12:18] root INFO:   Num Epochs = 200\n",
      "[2021/12/26 20:12:18] root INFO:   Instantaneous batch size per GPU = 8\n",
      "[2021/12/26 20:12:18] root INFO:   Total train batch size (w. parallel, distributed) = 8\n",
      "[2021/12/26 20:12:18] root INFO:   Total optimization steps = 3800\n",
      "[2021/12/26 20:12:20] root INFO: epoch: [0/200], iter: [0/19], global_step:1, train loss: 1.983819, lr: 0.000001, avg_reader_cost: 1.32728 sec, avg_batch_cost: 1.49863 sec, avg_samples: 8.00000, ips: 5.33822 images/sec\n",
      "[2021/12/26 20:12:21] root INFO: epoch: [0/200], iter: [1/19], global_step:2, train loss: 1.935008, lr: 0.000002, avg_reader_cost: 0.61179 sec, avg_batch_cost: 0.72010 sec, avg_samples: 8.00000, ips: 11.10955 images/sec\n",
      "[2021/12/26 20:12:23] root INFO: epoch: [0/200], iter: [2/19], global_step:3, train loss: 1.957709, lr: 0.000003, avg_reader_cost: 0.75516 sec, avg_batch_cost: 0.85305 sec, avg_samples: 8.00000, ips: 9.37815 images/sec\n",
      "Corrupt JPEG data: 18 extraneous bytes before marker 0xc4\n",
      "[2021/12/26 20:12:24] root INFO: epoch: [0/200], iter: [3/19], global_step:4, train loss: 1.842568, lr: 0.000004, avg_reader_cost: 0.76927 sec, avg_batch_cost: 0.86650 sec, avg_samples: 8.00000, ips: 9.23258 images/sec\n",
      "[2021/12/26 20:12:25] root INFO: epoch: [0/200], iter: [4/19], global_step:5, train loss: 1.941558, lr: 0.000005, avg_reader_cost: 0.67992 sec, avg_batch_cost: 0.77854 sec, avg_samples: 8.00000, ips: 10.27559 images/sec\n",
      "[2021/12/26 20:12:26] root INFO: epoch: [0/200], iter: [5/19], global_step:6, train loss: 1.879326, lr: 0.000006, avg_reader_cost: 0.62112 sec, avg_batch_cost: 0.71867 sec, avg_samples: 8.00000, ips: 11.13167 images/sec\n",
      "[2021/12/26 20:12:27] root INFO: epoch: [0/200], iter: [6/19], global_step:7, train loss: 1.833748, lr: 0.000007, avg_reader_cost: 0.79442 sec, avg_batch_cost: 0.89132 sec, avg_samples: 8.00000, ips: 8.97544 images/sec\n",
      "[2021/12/26 20:12:29] root INFO: epoch: [0/200], iter: [7/19], global_step:8, train loss: 1.747398, lr: 0.000008, avg_reader_cost: 0.74634 sec, avg_batch_cost: 0.84421 sec, avg_samples: 8.00000, ips: 9.47633 images/sec\n",
      "[2021/12/26 20:12:30] root INFO: epoch: [0/200], iter: [8/19], global_step:9, train loss: 1.603032, lr: 0.000009, avg_reader_cost: 0.79887 sec, avg_batch_cost: 0.89827 sec, avg_samples: 8.00000, ips: 8.90600 images/sec\n",
      "[2021/12/26 20:12:31] root INFO: epoch: [0/200], iter: [9/19], global_step:10, train loss: 1.678029, lr: 0.000010, avg_reader_cost: 0.78243 sec, avg_batch_cost: 0.88950 sec, avg_samples: 8.00000, ips: 8.99385 images/sec\n",
      "[2021/12/26 20:12:33] root INFO: [Eval]process: 0/7, loss: 1.41839\n",
      "[2021/12/26 20:12:34] root INFO: [Eval]process: 1/7, loss: 1.60403\n",
      "[2021/12/26 20:12:35] root INFO: [Eval]process: 2/7, loss: 1.70345\n",
      "[2021/12/26 20:12:36] root INFO: [Eval]process: 3/7, loss: 1.60751\n",
      "[2021/12/26 20:12:38] root INFO: [Eval]process: 4/7, loss: 1.49639\n",
      "Corrupt JPEG data: premature end of data segment\n",
      "[2021/12/26 20:12:39] root INFO: [Eval]process: 5/7, loss: 1.66062\n",
      "[2021/12/26 20:12:39] root INFO: [Eval]process: 6/7, loss: 1.56035\n",
      "[2021/12/26 20:12:40] root INFO: \n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "      ANSWER       0.01      0.01      0.01      1514\n",
      "      HEADER       0.00      0.00      0.00        58\n",
      "    QUESTION       0.03      0.02      0.02      1155\n",
      "\n",
      "   micro avg       0.02      0.01      0.01      2727\n",
      "   macro avg       0.01      0.01      0.01      2727\n",
      "weighted avg       0.02      0.01      0.01      2727\n",
      "\n",
      "[2021/12/26 20:12:40] root INFO: ***** Eval results  *****\n",
      "[2021/12/26 20:12:40] root INFO:   f1 = 0.013078227173649792\n",
      "[2021/12/26 20:12:40] root INFO:   loss = 1.5786780970437186\n",
      "[2021/12/26 20:12:40] root INFO:   precision = 0.01925820256776034\n",
      "[2021/12/26 20:12:40] root INFO:   recall = 0.009900990099009901\n",
      "[2021/12/26 20:12:44] root INFO: Saving model checkpoint to ./output/ser/best_model\n",
      "[2021/12/26 20:12:44] root INFO: [epoch 0/200][iter: 9/19] results: {'loss': 1.5786780970437186, 'precision': 0.01925820256776034, 'recall': 0.009900990099009901, 'f1': 0.013078227173649792}\n",
      "[2021/12/26 20:12:44] root INFO: best metrics: {'loss': 1.5786780970437186, 'precision': 0.01925820256776034, 'recall': 0.009900990099009901, 'f1': 0.013078227173649792}\n",
      "^C\n",
      "Traceback (most recent call last):\n",
      "  File \"train_ser.py\", line 248, in <module>\n",
      "    train(args)\n",
      "  File \"train_ser.py\", line 178, in train\n",
      "    loss = loss_class(labels, outputs, batch['attention_mask'])\n",
      "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py\", line 914, in __call__\n",
      "    outputs = self.forward(*inputs, **kwargs)\n",
      "  File \"/home/aistudio/PaddleOCR/ppstructure/vqa/losses.py\", line 29, in forward\n",
      "    [-1, self.num_classes])[active_loss]\n",
      "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/varbase_patch_methods.py\", line 594, in __getitem__\n",
      "    return _getitem_impl_(self, item)\n",
      "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/variable_index.py\", line 403, in _getitem_impl_\n",
      "    bool_2_idx = where(slice_item == True)\n",
      "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/layers/nn.py\", line 14242, in where\n",
      "    return _C_ops.where_index(condition)\n",
      "KeyboardInterrupt\n"
     ]
    }
   ],
   "source": [
    "# Fine-tune layoutxlm-base-uncased for SER on the XFUND zh split; the model is evaluated on zh_val\n",
    "# every 10 steps during training and checkpoints are written to ./output/ser/\n",
    "! python train_ser.py \\\n",
    "    --model_name_or_path layoutxlm-base-uncased \\\n",
    "    --ser_model_type LayoutXLM \\\n",
    "    --train_data_dir XFUND/zh_train/image \\\n",
    "    --train_label_path XFUND/zh_train/xfun_normalize_train.json \\\n",
    "    --eval_data_dir XFUND/zh_val/image \\\n",
    "    --eval_label_path XFUND/zh_val/xfun_normalize_val.json \\\n",
    "    --per_gpu_train_batch_size 8 \\\n",
    "    --per_gpu_eval_batch_size 8 \\\n",
    "    --num_train_epochs 200 \\\n",
    "    --eval_steps 10 \\\n",
    "    --output_dir ./output/ser/ \\\n",
    "    --learning_rate 5e-5 \\\n",
    "    --warmup_steps 50 \\\n",
    "    --evaluate_during_training \\\n",
    "    --num_workers 0 \\\n",
    "    --seed 2048"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "### 3.4 模型评估\n",
    "\n",
    "训练过程中，默认保存两种模型，一种是latest命名的最新训练的模型，一种是best命名的精度最高的模型。保存模型的文件夹结构如下所示\n",
    "\n",
    "```bash\n",
    "output/ser/\n",
    "├── best_model\n",
    "│   ├── model_config.json   # 模型配置\n",
    "│   ├── model_state.pdparams # 模型参数\n",
    "│   ├── sentencepiece.bpe.model # 分词器的参数\n",
    "│   ├── tokenizer_config.json # tokenizer的配置\n",
    "│   └── training_args.bin # 启动训练时的参数\n",
    "├── infer_results.txt\n",
    "├── latest_model\n",
    "│   ├── model_config.json\n",
    "│   ├── model_state.pdparams\n",
    "│   ├── sentencepiece.bpe.model\n",
    "│   ├── tokenizer_config.json\n",
    "│   └── training_args.bin\n",
    "├── test_gt.txt\n",
    "├── test_pred.txt\n",
    "└── train.log   # 训练日志\n",
    "```\n",
    "\n",
    "接下来使用保存的模型参数，在验证集上评估模型的 precision、recall 和 f1 指标："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2021/12/26 20:13:05] root INFO: -----------  Configuration Arguments -----------\n",
      "[2021/12/26 20:13:05] root INFO: adam_epsilon: 1e-08\n",
      "[2021/12/26 20:13:05] root INFO: det_model_dir: None\n",
      "[2021/12/26 20:13:05] root INFO: eval_data_dir: XFUND/zh_val/image\n",
      "[2021/12/26 20:13:05] root INFO: eval_label_path: XFUND/zh_val/xfun_normalize_val.json\n",
      "[2021/12/26 20:13:05] root INFO: eval_steps: 10\n",
      "[2021/12/26 20:13:05] root INFO: evaluate_during_training: False\n",
      "[2021/12/26 20:13:05] root INFO: infer_imgs: None\n",
      "[2021/12/26 20:13:05] root INFO: label_map_path: ./labels/labels_ser.txt\n",
      "[2021/12/26 20:13:05] root INFO: learning_rate: 5e-05\n",
      "[2021/12/26 20:13:05] root INFO: max_grad_norm: 1.0\n",
      "[2021/12/26 20:13:05] root INFO: max_seq_length: 512\n",
      "[2021/12/26 20:13:05] root INFO: model_name_or_path: output/ser/best_model\n",
      "[2021/12/26 20:13:05] root INFO: num_train_epochs: 3\n",
      "[2021/12/26 20:13:05] root INFO: num_workers: 8\n",
      "[2021/12/26 20:13:05] root INFO: ocr_json_path: None\n",
      "[2021/12/26 20:13:05] root INFO: output_dir: output/ser/\n",
      "[2021/12/26 20:13:05] root INFO: per_gpu_eval_batch_size: 8\n",
      "[2021/12/26 20:13:05] root INFO: per_gpu_train_batch_size: 8\n",
      "[2021/12/26 20:13:05] root INFO: re_model_name_or_path: None\n",
      "[2021/12/26 20:13:05] root INFO: rec_model_dir: None\n",
      "[2021/12/26 20:13:05] root INFO: resume: False\n",
      "[2021/12/26 20:13:05] root INFO: seed: 2048\n",
      "[2021/12/26 20:13:05] root INFO: ser_model_type: LayoutXLM\n",
      "[2021/12/26 20:13:05] root INFO: train_data_dir: None\n",
      "[2021/12/26 20:13:05] root INFO: train_label_path: None\n",
      "[2021/12/26 20:13:05] root INFO: warmup_steps: 0\n",
      "[2021/12/26 20:13:05] root INFO: weight_decay: 0.0\n",
      "[2021/12/26 20:13:05] root INFO: ------------------------------------------------\n",
      "W1226 20:13:05.816488  1230 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1\n",
      "W1226 20:13:05.820412  1230 device_context.cc:465] device: 0, cuDNN Version: 7.6.\n",
      "Corrupt JPEG data: premature end of data segment\n",
      "[2021/12/26 20:13:18] root INFO: [Eval]process: 0/7, loss: 1.41839\n",
      "[2021/12/26 20:13:18] root INFO: [Eval]process: 1/7, loss: 1.60403\n",
      "[2021/12/26 20:13:19] root INFO: [Eval]process: 2/7, loss: 1.70345\n",
      "[2021/12/26 20:13:19] root INFO: [Eval]process: 3/7, loss: 1.60751\n",
      "[2021/12/26 20:13:19] root INFO: [Eval]process: 4/7, loss: 1.49639\n",
      "[2021/12/26 20:13:19] root INFO: [Eval]process: 5/7, loss: 1.66062\n",
      "[2021/12/26 20:13:19] root INFO: [Eval]process: 6/7, loss: 1.56035\n",
      "[2021/12/26 20:13:20] root INFO: \n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "      ANSWER       0.01      0.01      0.01      1514\n",
      "      HEADER       0.00      0.00      0.00        58\n",
      "    QUESTION       0.03      0.02      0.02      1155\n",
      "\n",
      "   micro avg       0.02      0.01      0.01      2727\n",
      "   macro avg       0.01      0.01      0.01      2727\n",
      "weighted avg       0.02      0.01      0.01      2727\n",
      "\n",
      "[2021/12/26 20:13:20] root INFO: ***** Eval results  *****\n",
      "[2021/12/26 20:13:20] root INFO:   f1 = 0.013078227173649792\n",
      "[2021/12/26 20:13:20] root INFO:   loss = 1.5786780970437186\n",
      "[2021/12/26 20:13:20] root INFO:   precision = 0.01925820256776034\n",
      "[2021/12/26 20:13:20] root INFO:   recall = 0.009900990099009901\n",
      "[2021/12/26 20:13:20] root INFO: {'loss': 1.5786780970437186, 'precision': 0.01925820256776034, 'recall': 0.009900990099009901, 'f1': 0.013078227173649792}\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the checkpoint saved in output/ser/best_model on the XFUND zh_val split\n",
    "! python eval_ser.py \\\n",
    "    --model_name_or_path output/ser/best_model \\\n",
    "    --ser_model_type LayoutXLM \\\n",
    "    --eval_data_dir XFUND/zh_val/image \\\n",
    "    --eval_label_path XFUND/zh_val/xfun_normalize_val.json \\\n",
    "    --per_gpu_eval_batch_size 8 \\\n",
    "    --num_workers 8 \\\n",
    "    --output_dir output/ser/ \\\n",
    "    --seed 2048"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "### 3.5 模型预测\n",
    "\n",
    "训练好模型后，也可以使用保存好的模型，对单张图片或者某个文件夹的图像进行模型推理，观察模型预测效果。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "W1226 20:07:23.831934   640 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1\n",
      "W1226 20:07:23.835953   640 device_context.cc:465] device: 0, cuDNN Version: 7.6.\n",
      "[2021/12/26 20:07:33] root WARNING: version PP-OCRv2 not support cls models, auto switch to version PP-OCR\n",
      "Namespace(benchmark=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, det=True, det_algorithm='DB', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/det/ch/ch_PP-OCRv2_det_infer', det_pse_box_thresh=0.85, det_pse_box_type='box', det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_polygon=False, det_sast_score_thresh=0.5, drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='fast', e2e_pgnet_polygon=True, e2e_pgnet_score_thresh=0.5, e2e_pgnet_valid_set='totaltext', enable_mkldnn=False, gpu_mem=500, help='==SUPPRESS==', image_dir=None, ir_optim=True, label_list=['0', '180'], lang='ch', layout_path_model='lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config', max_batch_size=10, max_text_length=25, min_subgraph_size=15, ocr_version='PP-OCRv2', output='./output/table', precision='fp32', process_id=0, rec=True, rec_algorithm='CRNN', rec_batch_num=6, rec_char_dict_path='/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleocr/ppocr/utils/ppocr_keys_v1.txt', rec_image_shape='3, 32, 320', rec_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/rec/ch/ch_PP-OCRv2_rec_infer', save_log_path='./log_output/', show_log=False, structure_version='STRUCTURE', table_char_dict_path=None, table_char_type='en', table_max_len=488, table_model_dir=None, total_process_num=1, type='ocr', use_angle_cls=False, use_dilation=False, use_gpu=True, use_mp=False, use_onnx=False, use_pdserving=False, use_space_char=True, use_tensorrt=False, 
vis_font_path='./doc/fonts/simfang.ttf', warmup=True)\n",
      "process: [0/1], save result to output/ser_e2e/zh_val_42_ser.jpg\n",
      "Corrupt JPEG data: premature end of data segment\n"
     ]
    }
   ],
   "source": [
    "# Run end-to-end SER inference on a single image; results are saved under output/ser_e2e/\n",
    "! python infer_ser_e2e.py \\\n",
    "    --model_name_or_path ./inference/PP-Layout_v1.0_ser_pretrained/ \\\n",
    "    --ser_model_type LayoutXLM \\\n",
    "    --max_seq_length 512 \\\n",
    "    --output_dir output/ser_e2e/ \\\n",
    "    --infer_imgs images/input/zh_val_42.jpg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## 4. 作业\n",
    "\n",
    "实验题\n",
    "\n",
    "[https://aistudio.baidu.com/aistudio/projectdetail/3281385](https://aistudio.baidu.com/aistudio/projectdetail/3281385)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "py35-paddle1.2.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}