PaddleOCR/notebook/notebook_en/6.document_analysis/document_analysis_practice-...

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DOC-VQA SER Practice\n",
    "\n",
    "This section will introduce how to use PaddleOCR to complete the training and operation of the DOC-VQA SER algorithm, including:\n",
    "\n",
    "1. Understand the principle of DOC-VQA SER algorithm\n",
    "2. Master the training process of DOC-VQA SER code in PaddleOCR\n",
    "\n",
    "## 1 Quick Experience\n",
    "\n",
    "Prepare code and environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Requirement already satisfied: pip in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (21.3.1)\n",
      "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Requirement already satisfied: shapely in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 1)) (1.8.0)\n",
      "Requirement already satisfied: scikit-image in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (0.19.1)\n",
      "Requirement already satisfied: imgaug==0.4.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (0.4.0)\n",
      "Requirement already satisfied: pyclipper in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 4)) (1.3.0.post2)\n",
      "Requirement already satisfied: lmdb in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 5)) (1.2.1)\n",
      "Requirement already satisfied: tqdm in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 6)) (4.27.0)\n",
      "Requirement already satisfied: numpy in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 7)) (1.20.3)\n",
      "Requirement already satisfied: visualdl in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.2.0)\n",
      "Requirement already satisfied: python-Levenshtein in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 9)) (0.12.2)\n",
      "Requirement already satisfied: opencv-contrib-python==4.4.0.46 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 10)) (4.4.0.46)\n",
      "Requirement already satisfied: cython in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 11)) (0.29)\n",
      "Requirement already satisfied: lxml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 12)) (4.7.1)\n",
      "Requirement already satisfied: premailer in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 13)) (3.10.0)\n",
      "Requirement already satisfied: openpyxl in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 14)) (3.0.5)\n",
      "Requirement already satisfied: fasttext==0.9.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from -r /home/aistudio/PaddleOCR/requirements.txt (line 15)) (0.9.1)\n",
      "Requirement already satisfied: six in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (1.15.0)\n",
      "Requirement already satisfied: matplotlib in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (2.2.3)\n",
      "Requirement already satisfied: Pillow in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (7.1.2)\n",
      "Requirement already satisfied: imageio in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (2.6.1)\n",
      "Requirement already satisfied: scipy in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (1.6.3)\n",
      "Requirement already satisfied: opencv-python in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (4.1.1.26)\n",
      "Requirement already satisfied: pybind11>=2.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from fasttext==0.9.1->-r /home/aistudio/PaddleOCR/requirements.txt (line 15)) (2.8.1)\n",
      "Requirement already satisfied: setuptools>=0.7.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from fasttext==0.9.1->-r /home/aistudio/PaddleOCR/requirements.txt (line 15)) (56.2.0)\n",
      "Requirement already satisfied: tifffile>=2019.7.26 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (2021.11.2)\n",
      "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (20.9)\n",
      "Requirement already satisfied: networkx>=2.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (2.4)\n",
      "Requirement already satisfied: PyWavelets>=1.1.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (1.2.0)\n",
      "Requirement already satisfied: Flask-Babel>=1.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.0.0)\n",
      "Requirement already satisfied: shellcheck-py in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.7.1.1)\n",
      "Requirement already satisfied: protobuf>=3.11.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (3.14.0)\n",
      "Requirement already satisfied: pandas in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.1.5)\n",
      "Requirement already satisfied: flask>=1.1.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.1.1)\n",
      "Requirement already satisfied: requests in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.22.0)\n",
      "Requirement already satisfied: bce-python-sdk in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.8.53)\n",
      "Requirement already satisfied: pre-commit in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.21.0)\n",
      "Requirement already satisfied: flake8>=3.7.9 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (3.8.2)\n",
      "Requirement already satisfied: cssutils in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->-r /home/aistudio/PaddleOCR/requirements.txt (line 13)) (2.3.0)\n",
      "Requirement already satisfied: cachetools in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->-r /home/aistudio/PaddleOCR/requirements.txt (line 13)) (4.0.0)\n",
      "Requirement already satisfied: cssselect in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->-r /home/aistudio/PaddleOCR/requirements.txt (line 13)) (1.1.0)\n",
      "Requirement already satisfied: et-xmlfile in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from openpyxl->-r /home/aistudio/PaddleOCR/requirements.txt (line 14)) (1.0.1)\n",
      "Requirement already satisfied: jdcal in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from openpyxl->-r /home/aistudio/PaddleOCR/requirements.txt (line 14)) (1.4.1)\n",
      "Requirement already satisfied: importlib-metadata in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.23)\n",
      "Requirement already satisfied: pycodestyle<2.7.0,>=2.6.0a1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.6.0)\n",
      "Requirement already satisfied: mccabe<0.7.0,>=0.6.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.6.1)\n",
      "Requirement already satisfied: pyflakes<2.3.0,>=2.2.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.2.0)\n",
      "Requirement already satisfied: Werkzeug>=0.15 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.16.0)\n",
      "Requirement already satisfied: itsdangerous>=0.24 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.1.0)\n",
      "Requirement already satisfied: Jinja2>=2.10.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.11.0)\n",
      "Requirement already satisfied: click>=5.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (7.0)\n",
      "Requirement already satisfied: pytz in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Flask-Babel>=1.0.0->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2019.3)\n",
      "Requirement already satisfied: Babel>=2.3 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Flask-Babel>=1.0.0->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.8.0)\n",
      "Requirement already satisfied: decorator>=4.3.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from networkx>=2.2->scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (4.4.2)\n",
      "Requirement already satisfied: pyparsing>=2.0.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from packaging>=20.0->scikit-image->-r /home/aistudio/PaddleOCR/requirements.txt (line 2)) (2.4.2)\n",
      "Requirement already satisfied: pycryptodome>=3.8.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from bce-python-sdk->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (3.9.9)\n",
      "Requirement already satisfied: future>=0.6.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from bce-python-sdk->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.18.0)\n",
      "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (1.1.0)\n",
      "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (0.10.0)\n",
      "Requirement already satisfied: python-dateutil>=2.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->-r /home/aistudio/PaddleOCR/requirements.txt (line 3)) (2.8.0)\n",
      "Requirement already satisfied: identify>=1.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.4.10)\n",
      "Requirement already satisfied: nodeenv>=0.11.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.3.4)\n",
      "Requirement already satisfied: aspy.yaml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.3.0)\n",
      "Requirement already satisfied: pyyaml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (5.1.2)\n",
      "Requirement already satisfied: virtualenv>=15.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (16.7.9)\n",
      "Requirement already satisfied: toml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (0.10.0)\n",
      "Requirement already satisfied: cfgv>=2.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.0.1)\n",
      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.25.6)\n",
      "Requirement already satisfied: idna<2.9,>=2.5 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2.8)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (2019.9.11)\n",
      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (3.0.4)\n",
      "Requirement already satisfied: MarkupSafe>=0.23 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Jinja2>=2.10.1->flask>=1.1.1->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (1.1.1)\n",
      "Requirement already satisfied: zipp>=0.5 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from importlib-metadata->flake8>=3.7.9->visualdl->-r /home/aistudio/PaddleOCR/requirements.txt (line 8)) (3.6.0)\n",
      "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Requirement already satisfied: paddleocr in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (2.3.0.2)\n",
      "Requirement already satisfied: python-Levenshtein in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (0.12.2)\n",
      "Requirement already satisfied: opencv-contrib-python==4.4.0.46 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (4.4.0.46)\n",
      "Requirement already satisfied: lxml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (4.7.1)\n",
      "Requirement already satisfied: openpyxl in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (3.0.5)\n",
      "Requirement already satisfied: scikit-image in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (0.19.1)\n",
      "Requirement already satisfied: shapely in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (1.8.0)\n",
      "Requirement already satisfied: premailer in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (3.10.0)\n",
      "Requirement already satisfied: numpy in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (1.20.3)\n",
      "Requirement already satisfied: pyclipper in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (1.3.0.post2)\n",
      "Requirement already satisfied: cython in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (0.29)\n",
      "Requirement already satisfied: fasttext==0.9.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (0.9.1)\n",
      "Requirement already satisfied: tqdm in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (4.27.0)\n",
      "Requirement already satisfied: imgaug==0.4.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (0.4.0)\n",
      "Requirement already satisfied: visualdl in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (2.2.0)\n",
      "Requirement already satisfied: lmdb in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddleocr) (1.2.1)\n",
      "Requirement already satisfied: pybind11>=2.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from fasttext==0.9.1->paddleocr) (2.8.1)\n",
      "Requirement already satisfied: setuptools>=0.7.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from fasttext==0.9.1->paddleocr) (56.2.0)\n",
      "Requirement already satisfied: imageio in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (2.6.1)\n",
      "Requirement already satisfied: scipy in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (1.6.3)\n",
      "Requirement already satisfied: opencv-python in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (4.1.1.26)\n",
      "Requirement already satisfied: six in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (1.15.0)\n",
      "Requirement already satisfied: Pillow in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (7.1.2)\n",
      "Requirement already satisfied: matplotlib in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from imgaug==0.4.0->paddleocr) (2.2.3)\n",
      "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->paddleocr) (20.9)\n",
      "Requirement already satisfied: PyWavelets>=1.1.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->paddleocr) (1.2.0)\n",
      "Requirement already satisfied: tifffile>=2019.7.26 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->paddleocr) (2021.11.2)\n",
      "Requirement already satisfied: networkx>=2.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-image->paddleocr) (2.4)\n",
      "Requirement already satisfied: jdcal in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from openpyxl->paddleocr) (1.4.1)\n",
      "Requirement already satisfied: et-xmlfile in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from openpyxl->paddleocr) (1.0.1)\n",
      "Requirement already satisfied: requests in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->paddleocr) (2.22.0)\n",
      "Requirement already satisfied: cssutils in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->paddleocr) (2.3.0)\n",
      "Requirement already satisfied: cachetools in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->paddleocr) (4.0.0)\n",
      "Requirement already satisfied: cssselect in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from premailer->paddleocr) (1.1.0)\n",
      "Requirement already satisfied: pre-commit in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (1.21.0)\n",
      "Requirement already satisfied: pandas in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (1.1.5)\n",
      "Requirement already satisfied: flask>=1.1.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (1.1.1)\n",
      "Requirement already satisfied: protobuf>=3.11.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (3.14.0)\n",
      "Requirement already satisfied: bce-python-sdk in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (0.8.53)\n",
      "Requirement already satisfied: shellcheck-py in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (0.7.1.1)\n",
      "Requirement already satisfied: flake8>=3.7.9 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (3.8.2)\n",
      "Requirement already satisfied: Flask-Babel>=1.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from visualdl->paddleocr) (1.0.0)\n",
      "Requirement already satisfied: pyflakes<2.3.0,>=2.2.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->paddleocr) (2.2.0)\n",
      "Requirement already satisfied: importlib-metadata in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->paddleocr) (0.23)\n",
      "Requirement already satisfied: mccabe<0.7.0,>=0.6.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->paddleocr) (0.6.1)\n",
      "Requirement already satisfied: pycodestyle<2.7.0,>=2.6.0a1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flake8>=3.7.9->visualdl->paddleocr) (2.6.0)\n",
      "Requirement already satisfied: Werkzeug>=0.15 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->paddleocr) (0.16.0)\n",
      "Requirement already satisfied: itsdangerous>=0.24 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->paddleocr) (1.1.0)\n",
      "Requirement already satisfied: click>=5.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->paddleocr) (7.0)\n",
      "Requirement already satisfied: Jinja2>=2.10.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from flask>=1.1.1->visualdl->paddleocr) (2.11.0)\n",
      "Requirement already satisfied: Babel>=2.3 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Flask-Babel>=1.0.0->visualdl->paddleocr) (2.8.0)\n",
      "Requirement already satisfied: pytz in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Flask-Babel>=1.0.0->visualdl->paddleocr) (2019.3)\n",
      "Requirement already satisfied: decorator>=4.3.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from networkx>=2.2->scikit-image->paddleocr) (4.4.2)\n",
      "Requirement already satisfied: pyparsing>=2.0.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from packaging>=20.0->scikit-image->paddleocr) (2.4.2)\n",
      "Requirement already satisfied: future>=0.6.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from bce-python-sdk->visualdl->paddleocr) (0.18.0)\n",
      "Requirement already satisfied: pycryptodome>=3.8.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from bce-python-sdk->visualdl->paddleocr) (3.9.9)\n",
      "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->paddleocr) (1.1.0)\n",
      "Requirement already satisfied: python-dateutil>=2.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->paddleocr) (2.8.0)\n",
      "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib->imgaug==0.4.0->paddleocr) (0.10.0)\n",
      "Requirement already satisfied: aspy.yaml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (1.3.0)\n",
      "Requirement already satisfied: virtualenv>=15.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (16.7.9)\n",
      "Requirement already satisfied: pyyaml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (5.1.2)\n",
      "Requirement already satisfied: cfgv>=2.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (2.0.1)\n",
      "Requirement already satisfied: toml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (0.10.0)\n",
      "Requirement already satisfied: identify>=1.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (1.4.10)\n",
      "Requirement already satisfied: nodeenv>=0.11.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from pre-commit->visualdl->paddleocr) (1.3.4)\n",
      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->premailer->paddleocr) (3.0.4)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->premailer->paddleocr) (2019.9.11)\n",
      "Requirement already satisfied: idna<2.9,>=2.5 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->premailer->paddleocr) (2.8)\n",
      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from requests->premailer->paddleocr) (1.25.6)\n",
      "Requirement already satisfied: MarkupSafe>=0.23 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from Jinja2>=2.10.1->flask>=1.1.1->visualdl->paddleocr) (1.1.1)\n",
      "Requirement already satisfied: zipp>=0.5 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from importlib-metadata->flake8>=3.7.9->visualdl->paddleocr) (3.6.0)\n",
      "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Requirement already satisfied: yacs in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (0.1.8)\n",
      "Requirement already satisfied: gnureadline in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (8.0.0)\n",
      "Requirement already satisfied: paddlenlp==2.2.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (2.2.1)\n",
      "Requirement already satisfied: seqeval in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (1.2.2)\n",
      "Requirement already satisfied: multiprocess in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (0.70.11.1)\n",
      "Requirement already satisfied: colorlog in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (4.1.0)\n",
      "Requirement already satisfied: colorama in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (0.4.4)\n",
      "Requirement already satisfied: h5py in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (2.9.0)\n",
      "Requirement already satisfied: jieba in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from paddlenlp==2.2.1) (0.42.1)\n",
      "Requirement already satisfied: PyYAML in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from yacs) (5.1.2)\n",
      "Requirement already satisfied: numpy>=1.7 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from h5py->paddlenlp==2.2.1) (1.20.3)\n",
      "Requirement already satisfied: six in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from h5py->paddlenlp==2.2.1) (1.15.0)\n",
      "Requirement already satisfied: dill>=0.3.3 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from multiprocess->paddlenlp==2.2.1) (0.3.3)\n",
      "Requirement already satisfied: scikit-learn>=0.21.3 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from seqeval->paddlenlp==2.2.1) (0.24.2)\n",
      "Requirement already satisfied: scipy>=0.19.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-learn>=0.21.3->seqeval->paddlenlp==2.2.1) (1.6.3)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-learn>=0.21.3->seqeval->paddlenlp==2.2.1) (2.1.0)\n",
      "Requirement already satisfied: joblib>=0.11 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from scikit-learn>=0.21.3->seqeval->paddlenlp==2.2.1) (0.14.1)\n"
     ]
    }
   ],
   "source": [
    "# clone PaddleOCR code\n",
    "# ! git clone https://github.com/PaddlePaddle/PaddleOCR\n",
    "\n",
    "# Install dependencies\n",
    "! pip install -U pip\n",
    "! pip install -r /home/aistudio/PaddleOCR/requirements.txt\n",
    "! pip install paddleocr\n",
    "\n",
    "# Install dependencies\n",
    "! pip install yacs gnureadline paddlenlp==2.2.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# Change to the vqa directory\n",
    "import os\n",
    "os.chdir('/home/aistudio/PaddleOCR/ppstructure/vqa')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2021-12-22 16:03:11--  https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar\n",
      "Resolving paddleocr.bj.bcebos.com (paddleocr.bj.bcebos.com)... 182.61.200.195, 182.61.200.229, 2409:8c04:1001:1002:0:ff:b001:368a\n",
      "Connecting to paddleocr.bj.bcebos.com (paddleocr.bj.bcebos.com)|182.61.200.195|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 1481431040 (1.4G) [application/x-tar]\n",
      "Saving to: ‘./inference/PP-Layout_v1.0_ser_pretrained.tar’\n",
      "\n",
      "PP-Layout_v1.0_ser_ 100%[===================>]   1.38G  45.5MB/s    in 31s     \n",
      "\n",
      "2021-12-22 16:03:42 (45.6 MB/s) - ‘./inference/PP-Layout_v1.0_ser_pretrained.tar’ saved [1481431040/1481431040]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Download model\n",
    "! mkdir inference\n",
    "# Download the detection model of the SER model and unzip it\n",
    "! wget -P ./inference/ https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && cd inference && tar xf PP-Layout_v1.0_ser_pretrained.tar && cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "W1226 20:10:47.258977   900 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1\n",
      "W1226 20:10:47.263010   900 device_context.cc:465] device: 0, cuDNN Version: 7.6.\n",
      "[2021/12/26 20:10:57] root WARNING: version PP-OCRv2 not support cls models, auto switch to version PP-OCR\n",
      "Namespace(benchmark=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, det=True, det_algorithm='DB', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/det/ch/ch_PP-OCRv2_det_infer', det_pse_box_thresh=0.85, det_pse_box_type='box', det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_polygon=False, det_sast_score_thresh=0.5, drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='fast', e2e_pgnet_polygon=True, e2e_pgnet_score_thresh=0.5, e2e_pgnet_valid_set='totaltext', enable_mkldnn=False, gpu_mem=500, help='==SUPPRESS==', image_dir=None, ir_optim=True, label_list=['0', '180'], lang='ch', layout_path_model='lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config', max_batch_size=10, max_text_length=25, min_subgraph_size=15, ocr_version='PP-OCRv2', output='./output/table', precision='fp32', process_id=0, rec=True, rec_algorithm='CRNN', rec_batch_num=6, rec_char_dict_path='/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleocr/ppocr/utils/ppocr_keys_v1.txt', rec_image_shape='3, 32, 320', rec_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/rec/ch/ch_PP-OCRv2_rec_infer', save_log_path='./log_output/', show_log=False, structure_version='STRUCTURE', table_char_dict_path=None, table_char_type='en', table_max_len=488, table_model_dir=None, total_process_num=1, type='ocr', use_angle_cls=False, use_dilation=False, use_gpu=True, use_mp=False, use_onnx=False, use_pdserving=False, use_space_char=True, use_tensorrt=False, vis_font_path='./doc/fonts/simfang.ttf', warmup=True)\n",
      "process: [0/1], save result to output/res_e2e/zh_val_42_ser.jpg\n",
      "Corrupt JPEG data: premature end of data segment\n",
      "\u001b[0m"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<matplotlib.image.AxesImage at 0x7f034a4dfa90>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA8wAAAU7CAYAAAD8byz1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzs3Xm8XVV9///XWnvvM9x7cwmZiIRJhjDKlBBQ5AdIHX4gg1r5aq1Wii19gNUi8gBk1IRBfFjRWieqpfhTrEVpK6DfSqxSnChEQBCRGDCQEEJCQu69Z9p7r/X7Y5+1c+7lBIIiuQnv5+Nxe889Z5+9197nWPLenzUY7z0iIiIiIiIiMp7d0g0QERERERERmYwUmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPp4yQOzMeZNxpiHjDFLjTHnv9THFxEREREREdkc5qVch9kYEwG/AV4PPA78L/BO7/2vXrJGiIiIiIiIiGyGl7rCvABY6r1f5r3vAN8ATn6J2yAiIiIiIiLyvOKX+HhzgMd6/n4cOLx3A2PMXwN/DTA4ODhvn332eelaJyIiIiIiItu8u+++e433fubzbfdSB+bn5b3/EvAlgPnz5/u77rprC7dIREREREREtiXGmN9tznYvdZfsFcDOPX/v1H1OREREREREZFJ5qQPz/wJ7GWNeaYypAO8A/vMlboOIiIiIiIjI83pJu2R77zNjzPuB/wtEwFe89w+8lG0QERERERER2Rwv+Rhm7/2twK0v9XFFREREREREXoiXuku2iIiIiIiIyFZBgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPpQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6iLd0A15q3vst3QQRERERERF5AYwxW+S4L7sK85a60CIiIiIiIrJ1edkFZlWYRUREREREZHO87AKziIiIiIiIyOZQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPpQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPpQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERF5mfLeA2CM2cItERERmZwUmEVEREShWUREpA8FZhERkZexEJRDtVlEREQ2ird0A0RERGTLMMYoKIuIiDwHVZhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPpQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWeRlxhizpZsgIiIiIrJVUGAWeRlRWBYRERER2XwKzCIiIiIiIiJ9KDCLvIw45/DeY4zBe7+lmyMiIiIiMqkpMIu8jPQGZXXPFhERERF5bgrMIi8z1lpVl0VERERENoMCs8jLjMKyiIiIiMjmUWAWERERERER6UOBWURERERERKQPBWYRERERERGRPhSYRURERERERPpQYBYRERERERHpQ4FZREREREREpA8FZhEREREREZE+FJhFRERERERE+lBgFhEREREREelDgVlERERERESkDwVmERERERERkT4UmEVERERERET6UGAWERERERER6UOBWURERERERKQPBWYREZkUjPcYk+NxGAfGOwwGn+dkGPCGzGQYY4rtjcE5h8lyvDfF897gjSE3YEyGcwZnPMYXr2H8s34MBpzDGDBQbA/kmSPzefG8NxhvcIDLPd51d2dSjDfkxnWPAx5D2nF4YzCAyR3G5Xjvu88ZDBkeg/MGj+nuyxXnbAzGGTxZsa13kHsIxybDeEdGcVwRERH544m3dANEREQAxpzln6/5HetHdsMlLcg8ucuACta0cdRI0pw88kSViNxl+DzF2xqWDaTeEvniP2u1akKeQSfPscbhvMP6CPA4A3hb3jGOog77H1ThgXvBp9CJc2IXEccROSNU4yFaLQ84nIEkNtTjDTSzKRz4qgq/uBuiyEMc4dMUE40yZ+fteerJMZrNKh3vqNgKkW2y7/51qhXYcWbCrd93+NzgzRjOR9SSKnmWMnWHJuufrAMJmcmJTcr+B9S4/5djtNKI2Fc59wLDQAWs93j9l1xEROSPRhVmERGZFOoGbPIUPl7D4YfVqSRrefNxwySJ503HDuK95y1/UaM+kBNH64gqa5k5o0piM17/+mGqHt74hhqHHlwj64zx5lMSKkRYE1HxMdaCtZbYRMTGYC3EkWPqdM+Dv1wDpkmS5MXrkSdNM+r1Z2iNdSDJyKKIqrFY0+aQQ7dj7u6WwWE45k9g1syYViOHKGG3Pduse/oRBiorwMfUbQXrcl7z2jqj6+Dh+0e4/5FnSLMW1hoqdohatU4nbeOco9Nq4TKPo6ie77ZrjSlTYLD+DIOVmCiGTg4dk+Fiv6U/NhERkW2aArOIiEwKPgNjaiRUwYCNLS0DLoUf3T5G3XoeuK9JHD0NQGws657ZQFLpYBOoxob/XpxiDGCrLH88o+1TWs7gLEU3Z2/wHjwQWY+jTd5x7LrzDOYdXoe4SWwM3ntia9l3/52oJBGkFp9C7lIq1ZynN8DaZ9qsWZtjclj5ZEpUM3Q6OXgLNsNiyZ0nigDb4p671rH3PrDvq6Ywfep2RAk4crafnbLf/nDg/nVmzMyYu/cMTJRhTU5sK0ydCkkMzifYyhpsMsJgBFUbYdRRTERE5I9KgVlERCaFLIIcS1Qdox5D4sFa2GmPOh2bQzzCo4+0OfDgORgyYl/DUmGHGUP4FKbv4PEmJYlh3nzLbx8YoUrOWX9jiJIGmXWAA5MTxTBthsXjaDXh6fWeJf/bJE9GSZ3DU4Te0fVw8GExSezwvkUUx6TNHNeBV+5UZdb0iJUrwLoIlxmSyDBlaBZ7770Xhx+7J1GtTcc5qCR0csMTKzKWPvwoDz74W6K4TWwca1fk/OZXHdoZrF0bcedPIHcJeZ5QGRxl5aoV/OqhB/F+DNeZQqc5XHQrh2IgtYiIiPzRKDCLiMikUHHF5Fqd9gDtFNrekeew8pGcaVOGOfzwaWSdDnvsBc7mZMZRS2D5irV4A089OcTwtDrewr0PrudPjt8en9f48j+vxbikqPwC+GJSrqee8EQUx/KRweR1fGMGlYrFxlAbgkcfTXngnjZHvDZhoGJoOUvbDZE5mDYL7lmSst0sILJUYsP++1uaTVj7JHzv1gZzd62RmIy8GZN1Bnhs5QZanRrp6I6krWHSLMGTkfrRohLtcpLEM3VmMSdZVE2oDszhmafm8Nqjd8PnAzjrwVBMQKa8LCIi8kelwCwiIpOCj2DXPXZgeLtBZs2EnXfckR1mwsEHRxx2OMzZBcjqjI7CrnPmsM++0zjwoCkkpsq06XDAobDgEMOMGWCaNdauBRM3ede7p2NNC2tyrPFY44ktGOvxvvgbB1mWgXHkrRzvR6glCZ08o92qct8v1pMMrqXqICY
      "text/plain": [
       "<Figure size 3456x1728 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Perform SER forecast\n",
    "# https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.4/ppstructure/vqa/infer_ser_e2e.py\n",
    "\n",
    "! python infer_ser_e2e.py \\\n",
    "    --model_name_or_path \"./inference/PP-Layout_v1.0_ser_pretrained/\" \\\n",
    "    --max_seq_length 512 \\\n",
    "    --output_dir \"output/res_e2e/\" \\\n",
    "    --infer_imgs \"images/input/zh_val_42.jpg\"\n",
    "    \n",
    "import cv2\n",
    "from matplotlib import pyplot as plt\n",
    "# When using matplotlib.pyplot to draw in the notebook, you need to add this command to display\n",
    "%matplotlib inline\n",
    "\n",
    "img = cv2.imread('output/res_e2e/zh_val_42_ser.jpg')\n",
    "plt.figure(figsize=(48,24))\n",
    "plt.imshow(img)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2 Detailed Explanation of The Principle\n",
    "\n",
    "The DOC-VQA series algorithms in PaddleOCR are currently implemented based on the [LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf) paper, providing two tasks: SER and RE\n",
    "\n",
    "LayoutXLM is a multi-language version of LayoutLMV2. The schematic diagram of LayoutLMV2 is as follows:\n",
    "<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/82762e847487489ea92ead44679bbfbed5e5d0acbcf94a3081524ce50d29f513\" width=\"1000\" ></center>\n",
    "<br><center>Figure 1: LayoutLMV2 algorithm</center>\n",
    "\n",
    "Compared with Bert in NLP, LayoutXLM adds Image and Layout information of text in the image to the input of the model. LayoutXLM has been implemented in PaddleNLP, so here we introduce the data and network from the perspective of the model forward.\n",
    "\n",
    "### 2.1 Input Data Processing\n",
    "\n",
    "First, perform ocr recognition or pdf analysis on the image, obtain text and bbox information, and build the three inputs of the model on this basis:\n",
    "\n",
    "1. Text Embedding\n",
    "\n",
    "\tFirst, use WordPiece to segment the text recognized by OCR, then add [CLS] and [SEP] tags, and use [PAD] to fill in the length to get the text input sequence as follows:\n",
    "  \n",
    "   $$S=\\{[CLS], w_1, w_2, \\cdots , [SEP], [PAD], [PAD], \\cdots \\}, |S|=L$$\n",
    "\t\n",
    "   Then add the word vector, one-dimensional position vector, and segment vector to get the text vector, the formula is as follows:\n",
    "    \n",
    "   $$t_i=TokEmb(w_i)+PosEmb1D(i)+SegEmb(s_i), 0 \\leq i<L$$ \n",
    "    \n",
    "   One-dimensional position vector: the index of the word\n",
    "    \n",
    "     Segmented vector: A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "分词结果： ['▁我', '的中国', '心']\n",
      "转换为索引结果： {'input_ids': [0, 13129, 84072, 1801, 2], 'token_type_ids': [0, 0, 0, 0, 0]}\n"
     ]
    }
   ],
   "source": [
    "# Text Embedding demo\n",
    "\n",
    "from paddlenlp.transformers import LayoutXLMTokenizer\n",
    "\n",
    "tokenizer = LayoutXLMTokenizer.from_pretrained('inference/PP-Layout_v1.0_ser_pretrained')\n",
    "# Participle\n",
    "print('分词结果：', tokenizer.tokenize('我的中国心'))\n",
    "# Convert to index\n",
    "print('转换为索引结果：', tokenizer.encode('我的中国心'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Image Embedding\n",
    "\n",
    "\tWe use the ResNeXt-FPN network as the image encoder, first extract the feature map of the original document image, then average it into a fixed size (B * 256 * 7 * 7), and then expand the average pooled feature map by row (B * 256 * 49), after linear projection (B * 49 * 256), the characteristic sequence corresponding to the image can be obtained. Corresponding to the composition of the text vector, the image vector is also supplemented with one-dimensional relative position and segmentation information. Finally, add the feature vector, one-dimensional position vector, and segment vector to get the final image vector, as shown below:\n",
    "    \n",
    " \t$$v_i=Proj(VisTokEmb(I)_i)+PosEmb1D(i)+SegEmb([C]), 0 \\leq i<WH$$\n",
    "    \n",
    "    Segmented vector：C\n",
    "3. Layout Embedding\n",
    "\n",
    "\tCorresponding to the coordinate range covered by each word or image area on the page, a bounding box parallel to the coordinate axis is used to represent the layout information, and each bounding box is represented by 4 boundary coordinate values, width, and height. The final layout vector is obtained by concatenating the vectors corresponding to the 6 features:\n",
    "    \n",
    "   $$I_i=Concat(PosEmb2D_x(x_0, x_1, w), PosEmb2D_y(y_0, y_1, h)), 0 \\leq i<WH+L$$\n",
    "   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following demonstrates the process of constructing a network input from an input image in the prediction process. The whole process mainly includes the following steps\n",
    "\n",
    "1. Perform OCR recognition on the image\n",
    "2. Preprocess the image, including scaling to a specified size and normalization\n",
    "3. Segment and index the recognized text\n",
    "4. Normalize the text box so that its value is between 0-1000\n",
    "5. Pad the results after processing 3 and 4 to facilitate batch grouping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2021/12/26 20:11:19] root WARNING: version PP-OCRv2 not support cls models, auto switch to version PP-OCR\n",
      "Namespace(benchmark=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, det=True, det_algorithm='DB', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/det/ch/ch_PP-OCRv2_det_infer', det_pse_box_thresh=0.85, det_pse_box_type='box', det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_polygon=False, det_sast_score_thresh=0.5, drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='fast', e2e_pgnet_polygon=True, e2e_pgnet_score_thresh=0.5, e2e_pgnet_valid_set='totaltext', enable_mkldnn=False, gpu_mem=500, help='==SUPPRESS==', image_dir=None, ir_optim=True, label_list=['0', '180'], lang='ch', layout_path_model='lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config', max_batch_size=10, max_text_length=25, min_subgraph_size=15, ocr_version='PP-OCRv2', output='./output/table', precision='fp32', process_id=0, rec=True, rec_algorithm='CRNN', rec_batch_num=6, rec_char_dict_path='/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleocr/ppocr/utils/ppocr_keys_v1.txt', rec_image_shape='3, 32, 320', rec_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/rec/ch/ch_PP-OCRv2_rec_infer', save_log_path='./log_output/', show_log=False, structure_version='STRUCTURE', table_char_dict_path=None, table_char_type='en', table_max_len=488, table_model_dir=None, total_process_num=1, type='ocr', use_angle_cls=False, use_dilation=False, use_gpu=True, use_mp=False, use_onnx=False, use_pdserving=False, use_space_char=True, use_tensorrt=False, vis_font_path='./doc/fonts/simfang.ttf', warmup=True)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Corrupt JPEG data: premature end of data segment\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['input_ids', 'token_type_ids', 'bbox', 'attention_mask', 'image', 'segment_offset_id'])\n",
      "[2, 3, 224, 224]\n"
     ]
    }
   ],
   "source": [
    "# Predictive input construction\n",
    "# https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/vqa/vqa_utils.py\n",
    "\n",
    "import cv2\n",
    "import numpy as np\n",
    "import paddle\n",
    "from copy import deepcopy\n",
    "from paddleocr import PaddleOCR\n",
    "from paddlenlp.transformers import LayoutXLMTokenizer\n",
    "\n",
    "from infer_ser_e2e import trans_poly_to_bbox,pad_sentences,split_page\n",
    "\n",
    "def parse_ocr_info_for_ser(ocr_result):\n",
    "    # The ocr result is converted to dictionary form, and the text box is converted to a bounding rectangle\n",
    "    ocr_info = []\n",
    "    for res in ocr_result:\n",
    "        ocr_info.append({\n",
    "            \"text\": res[1][0],\n",
    "            \"bbox\": trans_poly_to_bbox(res[0]),\n",
    "            \"poly\": res[0],\n",
    "        })\n",
    "    return ocr_info\n",
    "\n",
    "def preprocess(\n",
    "        tokenizer,\n",
    "        ori_img,\n",
    "        ocr_info,\n",
    "        img_size=(224, 224),\n",
    "        pad_token_label_id=-100,\n",
    "        max_seq_len=512,\n",
    "        add_special_ids=False,\n",
    "        return_attention_mask=True, ):\n",
    "    ocr_info = deepcopy(ocr_info)\n",
    "    height = ori_img.shape[0]\n",
    "    width = ori_img.shape[1]\n",
    "    \n",
    "    # Resize the image to the specified shape\n",
    "    img = cv2.resize(ori_img, img_size).transpose([2, 0, 1]).astype(np.float32)\n",
    "    \n",
    "    segment_offset_id = [] # Stores the ending position of each text in input_ids\n",
    "    bbox_list = [] # Store a box normalized to 0-1000\n",
    "    input_ids_list = [] # Store the index of the text segment after the word segmentation in the vocabulary\n",
    "    token_type_ids_list = [] # Store the category information of the text segment\n",
    "\n",
    "    for info in ocr_info:\n",
    "        # box Normalized to 0-1000\n",
    "        # x1, y1, x2, y2\n",
    "        bbox = info[\"bbox\"]\n",
    "        bbox[0] = int(bbox[0] * 1000.0 / width)\n",
    "        bbox[2] = int(bbox[2] * 1000.0 / width)\n",
    "        bbox[1] = int(bbox[1] * 1000.0 / height)\n",
    "        bbox[3] = int(bbox[3] * 1000.0 / height)\n",
    "        \n",
    "        # Tokenizer the text information, including word segmentation and conversion to the index in the vocabulary\n",
    "        text = info[\"text\"]\n",
    "        encode_res = tokenizer.encode(\n",
    "            text, pad_to_max_seq_len=False, return_attention_mask=True)\n",
    "        \n",
    "        # Decide whether to delete special characters according to the parameters\n",
    "        if not add_special_ids:\n",
    "            # TODO: use tok.all_special_ids to remove\n",
    "            encode_res[\"input_ids\"] = encode_res[\"input_ids\"][1:-1]\n",
    "            encode_res[\"token_type_ids\"] = encode_res[\"token_type_ids\"][1:-1]\n",
    "            encode_res[\"attention_mask\"] = encode_res[\"attention_mask\"][1:-1]\n",
    "\n",
    "        input_ids_list.extend(encode_res[\"input_ids\"])\n",
    "        token_type_ids_list.extend(encode_res[\"token_type_ids\"])\n",
    "        bbox_list.extend([bbox] * len(encode_res[\"input_ids\"]))\n",
    "        segment_offset_id.append(len(input_ids_list))\n",
    "\n",
    "    encoded_inputs = {\n",
    "        \"input_ids\": input_ids_list,\n",
    "        \"token_type_ids\": token_type_ids_list,\n",
    "        \"bbox\": bbox_list,\n",
    "        \"attention_mask\": [1] * len(input_ids_list),\n",
    "    }\n",
    "    # Val pad to the specified length, and 0 to supplement the length that is not enough\n",
    "    encoded_inputs = pad_sentences(\n",
    "        tokenizer,\n",
    "        encoded_inputs,\n",
    "        max_seq_len=max_seq_len,\n",
    "        return_attention_mask=return_attention_mask)\n",
    "    \n",
    "   # input_ids> 512, divided into 2 batches\n",
    "    ncoded_inputs = split_page(encoded_inputs)\n",
    "\n",
    "    fake_bs = encoded_inputs[\"input_ids\"].shape[0]\n",
    "\n",
    "    encoded_inputs[\"image\"] = paddle.to_tensor(img).unsqueeze(0).expand(\n",
    "        [fake_bs] + list(img.shape))\n",
    "\n",
    "    encoded_inputs[\"segment_offset_id\"] = segment_offset_id\n",
    "\n",
    "    return encoded_inputs\n",
    "\n",
    "img = cv2.imread('images/input/zh_val_42.jpg')\n",
    "\n",
    "ocr_engine = PaddleOCR(use_angle_cls=False,show_log=False)\n",
    "# Perform ocr recognition\n",
    "ocr_result = ocr_engine.ocr(img, cls=False)\n",
    "# ocr The result is converted to dictionary form, and the text box is converted to a bounding rectangle\n",
    "ocr_info = parse_ocr_info_for_ser(ocr_result)\n",
    "\n",
    "\n",
    "tokenizer = LayoutXLMTokenizer.from_pretrained('inference/PP-Layout_v1.0_ser_pretrained')\n",
    "# Resize the image,\n",
    "# Perform word segmentation on the text, convert it to dictionary index and other operations,\n",
    "# Normalize the box\n",
    "max_seq_length = 512\n",
    "inputs = preprocess(tokenizer=tokenizer,ori_img=img,ocr_info=ocr_info,max_seq_len=max_seq_length, img_size=(224,224))\n",
    "\n",
    "print(inputs.keys())\n",
    "print(inputs['image'].shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The processed data is a dictionary containing the following fields:\n",
    "<center>\n",
    "  \n",
    "| Field | Meaning |\n",
    "|---|---|\n",
    "|image| Image resize 224*224 |\n",
    "|bbox| Box normalized to 0-1000 |\n",
    "|input_ids| The index of the text segment after the text has been segmented in the vocabulary |\n",
    "|token_type_ids| Category information of the text segment|\n",
    "|attention_mask| Mask the text segment, the corresponding position of the special character is marked as 0, and the corresponding position of the text segment is marked as 1. |\n",
    "|segment_offset_id| Record the ending position of each text in input_ids|\n",
    "  \n",
    "<center/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.2 SER Network\n",
    "\n",
    "SER: Semantic Entity Recognition, which can recognize and classify text in images.\n",
    "A fully connected classification header is added to the output of the SER network LayoutXLMModel, and the network code is as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/layoutxlm/modeling.py#L846\n",
    "\n",
    "from paddlenlp.transformers import LayoutXLMPretrainedModel\n",
    "from paddle import nn\n",
    "class LayoutXLMForTokenClassification(LayoutXLMPretrainedModel):\n",
    "    def __init__(self, layoutxlm, num_classes=2, dropout=None):\n",
    "        super(LayoutXLMForTokenClassification, self).__init__()\n",
    "        self.num_classes = num_classes\n",
    "        if isinstance(layoutxlm, dict):\n",
    "            self.layoutxlm = LayoutXLMModel(**layoutxlm)\n",
    "        else:\n",
    "            self.layoutxlm = layoutxlm\n",
    "        self.dropout = nn.Dropout(dropout if dropout is not None else self.layoutxlm.config[\"hidden_dropout_prob\"])\n",
    "        self.classifier = nn.Linear(self.layoutxlm.config[\"hidden_size\"],num_classes)\n",
    "        self.classifier.apply(self.init_weights)\n",
    "\n",
    "    def get_input_embeddings(self):\n",
    "        return self.layoutxlm.embeddings.word_embeddings\n",
    "\n",
    "    def forward(self, input_ids=None, bbox=None, image=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None):\n",
    "        # backbone Calculation\n",
    "        outputs = self.layoutxlm(input_ids=input_ids, bbox=bbox, image=image, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask)\n",
    "        seq_length = input_ids.shape[1]\n",
    "        # head Calculation\n",
    "        sequence_output, image_output = outputs[0][:, :seq_length], outputs[0][:, seq_length:]\n",
    "        sequence_output = self.dropout(sequence_output)\n",
    "        logits = self.classifier(sequence_output)\n",
    "\n",
    "        outputs = logits,\n",
    "        \n",
    "        # Calculation loss\n",
    "        if labels is not None:\n",
    "            loss_fct = nn.CrossEntropyLoss()\n",
    "\n",
    "            if attention_mask is not None:\n",
    "                active_loss = attention_mask.reshape([-1, ]) == 1\n",
    "                active_logits = logits.reshape([-1, self.num_classes])[active_loss]\n",
    "                active_labels = labels.reshape([-1, ])[active_loss]\n",
    "                loss = loss_fct(active_logits, active_labels)\n",
    "            else:\n",
    "                loss = loss_fct(logits.reshape([-1, self.num_classes]),labels.reshape([-1, ]))\n",
    "            outputs = (loss, ) + outputs\n",
    "        return outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2, 512, 7]\n"
     ]
    }
   ],
   "source": [
    "# Initialize the network\n",
    "net = LayoutXLMForTokenClassification.from_pretrained('inference/PP-Layout_v1.0_ser_pretrained')\n",
    "net.eval()\n",
    "# Perform network forward\n",
    "outputs = net(input_ids=inputs[\"input_ids\"],\n",
    "            bbox=inputs[\"bbox\"],\n",
    "            image=inputs[\"image\"],\n",
    "            token_type_ids=inputs[\"token_type_ids\"],\n",
    "            attention_mask=inputs[\"attention_mask\"])\n",
    "print(outputs[0].shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.3 Post-Processing\n",
    "\n",
    "Post-processing mainly completes the correspondence between the predicted results of the text output of the model and the text, and combines the results with the results of OCR, mainly including the following steps\n",
    "\n",
    "1. For each text, count the predicted labels of all text segments under the text\n",
    "2. Select the label with the most predictions of all text segments as the label of the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "label2id_map: {'O': 0, 'B-QUESTION': 1, 'I-QUESTION': 2, 'B-ANSWER': 3, 'I-ANSWER': 4, 'B-HEADER': 5, 'I-HEADER': 6}\n",
      "label2id_map_for_draw: {'O': 0, 'B-QUESTION': 1, 'I-QUESTION': 1, 'B-ANSWER': 3, 'I-ANSWER': 3, 'B-HEADER': 5, 'I-HEADER': 5}\n",
      "id2label_map: {0: 'O', 1: 'QUESTION', 3: 'ANSWER', 5: 'HEADER'}\n",
      "[{'text': '个人信息登记表', 'bbox': [1026.0, 292.0, 1495.0, 377.0], 'poly': [[1027.0, 292.0], [1495.0, 300.0], [1494.0, 377.0], [1026.0, 369.0]], 'pred_id': 5, 'pred': 'HEADER'}, {'text': '申报学院（部门）：', 'bbox': [207.0, 424.0, 587.0, 475.0], 'poly': [[207.0, 424.0], [587.0, 424.0], [587.0, 475.0], [207.0, 475.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '出生', 'bbox': [1144.0, 526.0, 1218.0, 566.0], 'poly': [[1144.0, 526.0], [1218.0, 526.0], [1218.0, 566.0], [1144.0, 566.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '政治', 'bbox': [1616.0, 530.0, 1709.0, 570.0], 'poly': [[1616.0, 530.0], [1709.0, 530.0], [1709.0, 570.0], [1616.0, 570.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '1997年12月17日「面貌', 'bbox': [1298.0, 558.0, 1713.0, 644.0], 'poly': [[1301.0, 558.0], [1713.0, 571.0], [1711.0, 644.0], [1298.0, 631.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '岳欣欣', 'bbox': [491.0, 559.0, 653.0, 614.0], 'poly': [[491.0, 559.0], [653.0, 559.0], [653.0, 614.0], [491.0, 614.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '性别', 'bbox': [805.0, 559.0, 908.0, 618.0], 'poly': [[805.0, 559.0], [908.0, 559.0], [908.0, 618.0], [805.0, 618.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '群众', 'bbox': [1801.0, 552.0, 1886.0, 614.0], 'poly': [[1801.0, 552.0], [1886.0, 552.0], [1886.0, 614.0], [1801.0, 614.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '女', 'bbox': [1008.0, 563.0, 1070.0, 610.0], 'poly': [[1008.0, 563.0], [1070.0, 563.0], [1070.0, 610.0], [1008.0, 610.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '(拼音)', 'bbox': [207.0, 577.0, 354.0, 629.0], 'poly': [[207.0, 577.0], [354.0, 577.0], [354.0, 629.0], [207.0, 629.0]], 'pred_id': 0, 'pred': 'O'}, {'text': '日期', 'bbox': [1126.0, 577.0, 1222.0, 632.0], 'poly': [[1126.0, 577.0], [1222.0, 577.0], [1222.0, 632.0], [1126.0, 632.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '婚育「', 'bbox': [1120.0, 634.0, 1272.0, 714.0], 'poly': [[1127.0, 634.0], [1272.0, 649.0], [1265.0, 714.0], [1120.0, 699.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '生源地/培养形式', 'bbox': [1506.0, 647.0, 1790.0, 698.0], 'poly': [[1506.0, 647.0], [1790.0, 647.0], [1790.0, 698.0], [1506.0, 698.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '未婚', 'bbox': [1270.0, 654.0, 1369.0, 709.0], 'poly': [[1270.0, 654.0], [1369.0, 654.0], [1369.0, 709.0], [1270.0, 709.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '寸报名照', 'bbox': [2140.0, 650.0, 2273.0, 702.0], 'poly': [[2140.0, 650.0], [2273.0, 650.0], [2273.0, 702.0], [2140.0, 702.0]], 'pred_id': 0, 'pred': 'O'}, {'text': '中国', 'bbox': [443.0, 661.0, 528.0, 720.0], 'poly': [[443.0, 661.0], [528.0, 661.0], [528.0, 720.0], [443.0, 720.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '国籍', 'bbox': [244.0, 672.0, 336.0, 731.0], 'poly': [[244.0, 672.0], [336.0, 672.0], [336.0, 731.0], [244.0, 731.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '民族', 'bbox': [812.0, 672.0, 904.0, 734.0], 'poly': [[812.0, 672.0], [904.0, 672.0], [904.0, 734.0], [812.0, 734.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '辽宁省西丰县', 'bbox': [1841.0, 668.0, 2064.0, 731.0], 'poly': [[1843.0, 668.0], [2064.0, 677.0], [2062.0, 731.0], [1841.0, 723.0]], 'pred_id': 3, 'pred': 'ANSWER'}, {'text': '状况', 'bbox': [1137.0, 713.0, 1218.0, 756.0], 'poly': [[1137.0, 713.0], [1218.0, 713.0], [1218.0, 756.0], [1137.0, 756.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '(应届毕业生填写）', 'bbox': [1528.0, 720.0, 1782.0, 760.0], 'poly': [[1528.0, 720.0], [1782.0, 720.0], [1782.0, 760.0], [1528.0, 760.0]], 'pred_id': 0, 'pred': 'O'}, {'text': '粘贴处', 'bbox': [2140.0, 716.0, 2251.0, 767.0], 'poly': [[2140.0, 716.0], [2251.0, 716.0], [2251.0, 767.0], [2140.0, 767.0]], 'pred_id': 0, 'pred': 'O'}, {'text': '现工作（学习)', 'bbox': [192.0, 767.0, 484.0, 822.0], 'poly': [[192.0, 767.0], [484.0, 771.0], [483.0, 822.0], [192.0, 818.0]], 'pred_id': 1, 'pred': 'QUESTION'}, {'text': '大连海事大学', 'bbox': [542.0, 781.0, 794.0, 841.0], 'poly': [[544.0, 781.0], [
     ]
    }
   ],
   "source": [
    "# https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/vqa/vqa_utils.py\n",
    "\n",
    "import paddle\n",
    "import numpy as np\n",
    "\n",
    "from infer_ser_e2e import get_bio_label_maps\n",
    "\n",
    "label2id_map, id2label_map = get_bio_label_maps('labels/labels_ser.txt')\n",
    "\n",
    "def postprocess(attention_mask, preds, id2label_map):\n",
    "    if isinstance(preds, paddle.Tensor):\n",
    "        preds = preds.numpy()\n",
    "    preds = np.argmax(preds, axis=2)\n",
    "\n",
    "    preds_list = [[] for _ in range(preds.shape[0])]\n",
    "\n",
    "    # keep batch info\n",
    "    for i in range(preds.shape[0]):\n",
    "        for j in range(preds.shape[1]):\n",
    "            if attention_mask[i][j] == 1:\n",
    "                preds_list[i].append(id2label_map[preds[i][j]])\n",
    "\n",
    "    return preds_list\n",
    "\n",
    "def merge_preds_list_with_ocr_info(ocr_info, segment_offset_id, preds_list,\n",
    "                                   label2id_map_for_draw):\n",
    "    # \blist flatten\n",
    "    preds = [p for pred in preds_list for p in pred]\n",
    "    \n",
    "    # The dictionary of label2idx is converted to the field of idx2label, and the prefixes of B- and I- are removed\n",
    "    id2label_map = dict()\n",
    "    for key in label2id_map_for_draw:\n",
    "        val = label2id_map_for_draw[key]\n",
    "        if key == \"O\":\n",
    "            id2label_map[val] = key\n",
    "        if key.startswith(\"B-\") or key.startswith(\"I-\"):\n",
    "            id2label_map[val] = key[2:]\n",
    "        else:\n",
    "            id2label_map[val] = key\n",
    "    print(\"id2label_map:\",id2label_map)\n",
    "    \n",
    "    # For each text, count the predicted label\n",
    "    for idx in range(len(segment_offset_id)):\n",
    "        if idx == 0:\n",
    "            start_id = 0\n",
    "        else:\n",
    "            start_id = segment_offset_id[idx - 1]\n",
    "    \n",
    "        end_id = segment_offset_id[idx]\n",
    "        # Take out the range of text in the output\n",
    "        curr_pred = preds[start_id:end_id]\n",
    "        # Take out all the prediction results of the text in the output\n",
    "        curr_pred = [label2id_map_for_draw[p] for p in curr_pred]\n",
    "\n",
    "        if len(curr_pred) <= 0:\n",
    "            pred_id = 0\n",
    "        else:\n",
    "            # print(\"pred label:\",curr_pred)\n",
    "            # Count label\n",
    "            counts = np.bincount(curr_pred)\n",
    "            # print(\"counts:\",counts)\n",
    "            pred_id = np.argmax(counts)\n",
    "        ocr_info[idx][\"pred_id\"] = int(pred_id)\n",
    "        ocr_info[idx][\"pred\"] = id2label_map[int(pred_id)]\n",
    "        # print(\"pred label:\",id2label_map[int(pred_id)])\n",
    "    return ocr_info\n",
    "\n",
    "preds = postprocess(inputs[\"attention_mask\"], outputs[0], id2label_map)\n",
    "\n",
    "# Replace the value label at the beginning of I with the value label at the beginning of B\n",
    "label2id_map_for_draw = dict()\n",
    "for key in label2id_map:\n",
    "    if key.startswith(\"I-\"):\n",
    "        label2id_map_for_draw[key] = label2id_map[\"B\" + key[1:]]\n",
    "    else:\n",
    "        label2id_map_for_draw[key] = label2id_map[key]\n",
    "print(\"label2id_map:\",label2id_map)\n",
    "print(\"label2id_map_for_draw:\",label2id_map_for_draw)\n",
    "# Combine forecast information and ocr information\n",
    "ocr_info_with_ser = merge_preds_list_with_ocr_info(ocr_info, inputs[\"segment_offset_id\"], preds, label2id_map_for_draw)\n",
    "print(ocr_info_with_ser)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3 Training\n",
    "\n",
    "\n",
    "This section takes the XFUN Chinese data set as an example to introduce how to complete the training, evaluation and testing of the SER model.\n",
    "\n",
    "### 3.1 Data Preparation\n",
    "\n",
    "Here, the [XFUN](https://github.com/doc-analysis/XFUND) dataset is used as the experimental dataset.\n",
    "The XFUN data set is a multilingual data set for KIE tasks proposed by Microsoft. It contains a total of seven data sets, each of which contains 149 training sets and 50 validation sets.\n",
    "\n",
    "* ZH (Chinese)\n",
    "* JA (Japanese)\n",
    "* ES (Spain)\n",
    "* FR (French)\n",
    "* IT (Italy)\n",
    "* DE (German)\n",
    "* PT (Portugal)\n",
    "\n",
    "This experiment selects the Chinese data set as our demonstration data set. The French data set is used as a data set for practical courses. The sample data set is shown in the figure below\n",
    "\n",
    "\n",
    "<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/0f84137778cd4ab6899c64109d452290e9c678ccf01744978bc9c0647adbba45\" width=\"1000\" ></center>\n",
    "<br><center>Figure 2: Sample data set, left Chinese, right French</center>\n",
    "\n",
    "\n",
    "You can run the following commands to complete the download and decompression of the Chinese data set, or from [https://github.com/doc-analysis/XFUND](https://github.com/doc-analysis/XFUND) Download by yourself。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File ‘XFUND.tar’ already there; not retrieving.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "! wget https://paddleocr.bj.bcebos.com/dataset/XFUND.tar\n",
    "! tar -xf XFUND.tar\n",
    "\n",
    "#XFUN other data sets use the following code to convert\n",
    "# https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.4/ppstructure/vqa/helper/trans_xfun_data.py"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After running the above command, there are 2 folders in the /home/aistudio/PaddleOCR/ppstructure/vqa/XFUND directory, and the directory structure is as follows:\n",
    "\n",
    "```bash\n",
    "/home/aistudio/PaddleOCR/ppstructure/vqa/XFUND\n",
    "   └─ zh_train/ training set\n",
    "       ├── image/ image storage folder\n",
    "       ├── xfun_normalize_train.json label information\n",
    "   └─ zh_val/ verification set\n",
    "       ├── image/ image storage folder\n",
    "       ├── xfun_normalize_val.json label information\n",
    "\n",
    "```\n",
    "\n",
    "The label format of this dataset is\n",
    "\n",
    "```bash\n",
    "{\n",
    "    \"height\": 3508, # Image height\n",
    "    \"width\": 2480,  # Image width\n",
    "    \"ocr_info\": [\n",
    "        {\n",
    "            \"text\": \"邮政地址:\",  # Single text content\n",
    "            \"label\": \"question\", # The category of the text\n",
    "            \"bbox\": [261, 802, 483, 859], # Single text box\n",
    "            \"id\": 54,  # Text Index\n",
    "            \"linking\": [[54, 60]], # The relationship between the current text and other texts [question, answer]\n",
    "            \"words\": []\n",
    "        },\n",
    "        {\n",
    "            \"text\": \"湖南省怀化市市辖区\",\n",
    "            \"label\": \"answer\",\n",
    "            \"bbox\": [487, 810, 862, 859],\n",
    "            \"id\": 60,\n",
    "            \"linking\": [[54, 60]],\n",
    "            \"words\": []\n",
    "        }\n",
    "    ]\n",
    "}\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.2 Loss Function Definition\n",
    "\n",
    "Because it is a multi-classification task, loss uses CrossEntropyLoss\n",
    "\n",
    "### 3.3 Model Training\n",
    "\n",
    "After completing the data processing and loss function definition, you can start training the model.\n",
    "\n",
    "The specific training commands are as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2021/12/26 20:12:07] root INFO: -----------  Configuration Arguments -----------\n",
      "[2021/12/26 20:12:07] root INFO: adam_epsilon: 1e-08\n",
      "[2021/12/26 20:12:07] root INFO: det_model_dir: None\n",
      "[2021/12/26 20:12:07] root INFO: eval_data_dir: XFUND/zh_val/image\n",
      "[2021/12/26 20:12:07] root INFO: eval_label_path: XFUND/zh_val/xfun_normalize_val.json\n",
      "[2021/12/26 20:12:07] root INFO: eval_steps: 10\n",
      "[2021/12/26 20:12:07] root INFO: evaluate_during_training: True\n",
      "[2021/12/26 20:12:07] root INFO: infer_imgs: None\n",
      "[2021/12/26 20:12:07] root INFO: label_map_path: ./labels/labels_ser.txt\n",
      "[2021/12/26 20:12:07] root INFO: learning_rate: 5e-05\n",
      "[2021/12/26 20:12:07] root INFO: max_grad_norm: 1.0\n",
      "[2021/12/26 20:12:07] root INFO: max_seq_length: 512\n",
      "[2021/12/26 20:12:07] root INFO: model_name_or_path: layoutxlm-base-uncased\n",
      "[2021/12/26 20:12:07] root INFO: num_train_epochs: 200\n",
      "[2021/12/26 20:12:07] root INFO: num_workers: 0\n",
      "[2021/12/26 20:12:07] root INFO: ocr_json_path: None\n",
      "[2021/12/26 20:12:07] root INFO: output_dir: ./output/ser/\n",
      "[2021/12/26 20:12:07] root INFO: per_gpu_eval_batch_size: 8\n",
      "[2021/12/26 20:12:07] root INFO: per_gpu_train_batch_size: 8\n",
      "[2021/12/26 20:12:07] root INFO: re_model_name_or_path: None\n",
      "[2021/12/26 20:12:07] root INFO: rec_model_dir: None\n",
      "[2021/12/26 20:12:07] root INFO: resume: False\n",
      "[2021/12/26 20:12:07] root INFO: seed: 2048\n",
      "[2021/12/26 20:12:07] root INFO: ser_model_type: LayoutXLM\n",
      "[2021/12/26 20:12:07] root INFO: train_data_dir: XFUND/zh_train/image\n",
      "[2021/12/26 20:12:07] root INFO: train_label_path: XFUND/zh_train/xfun_normalize_train.json\n",
      "[2021/12/26 20:12:07] root INFO: warmup_steps: 50\n",
      "[2021/12/26 20:12:07] root INFO: weight_decay: 0.0\n",
      "[2021/12/26 20:12:07] root INFO: ------------------------------------------------\n",
      "[2021-12-26 20:12:07,259] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/layoutxlm-base-uncased/sentencepiece.bpe.model\n",
      "[2021-12-26 20:12:07,928] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/layoutxlm-base-uncased/model_state.pdparams\n",
      "W1226 20:12:07.929606  1085 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1\n",
      "W1226 20:12:07.933472  1085 device_context.cc:465] device: 0, cuDNN Version: 7.6.\n",
      "[2021/12/26 20:12:18] root INFO: train from scratch\n",
      "[2021/12/26 20:12:18] root INFO: ***** Running training *****\n",
      "[2021/12/26 20:12:18] root INFO:   Num examples = 149\n",
      "[2021/12/26 20:12:18] root INFO:   Num Epochs = 200\n",
      "[2021/12/26 20:12:18] root INFO:   Instantaneous batch size per GPU = 8\n",
      "[2021/12/26 20:12:18] root INFO:   Total train batch size (w. parallel, distributed) = 8\n",
      "[2021/12/26 20:12:18] root INFO:   Total optimization steps = 3800\n",
      "[2021/12/26 20:12:20] root INFO: epoch: [0/200], iter: [0/19], global_step:1, train loss: 1.983819, lr: 0.000001, avg_reader_cost: 1.32728 sec, avg_batch_cost: 1.49863 sec, avg_samples: 8.00000, ips: 5.33822 images/sec\n",
      "[2021/12/26 20:12:21] root INFO: epoch: [0/200], iter: [1/19], global_step:2, train loss: 1.935008, lr: 0.000002, avg_reader_cost: 0.61179 sec, avg_batch_cost: 0.72010 sec, avg_samples: 8.00000, ips: 11.10955 images/sec\n",
      "[2021/12/26 20:12:23] root INFO: epoch: [0/200], iter: [2/19], global_step:3, train loss: 1.957709, lr: 0.000003, avg_reader_cost: 0.75516 sec, avg_batch_cost: 0.85305 sec, avg_samples: 8.00000, ips: 9.37815 images/sec\n",
      "Corrupt JPEG data: 18 extraneous bytes before marker 0xc4\n",
      "[2021/12/26 20:12:24] root INFO: epoch: [0/200], iter: [3/19], global_step:4, train loss: 1.842568, lr: 0.000004, avg_reader_cost: 0.76927 sec, avg_batch_cost: 0.86650 sec, avg_samples: 8.00000, ips: 9.23258 images/sec\n",
      "[2021/12/26 20:12:25] root INFO: epoch: [0/200], iter: [4/19], global_step:5, train loss: 1.941558, lr: 0.000005, avg_reader_cost: 0.67992 sec, avg_batch_cost: 0.77854 sec, avg_samples: 8.00000, ips: 10.27559 images/sec\n",
      "[2021/12/26 20:12:26] root INFO: epoch: [0/200], iter: [5/19], global_step:6, train loss: 1.879326, lr: 0.000006, avg_reader_cost: 0.62112 sec, avg_batch_cost: 0.71867 sec, avg_samples: 8.00000, ips: 11.13167 images/sec\n",
      "[2021/12/26 20:12:27] root INFO: epoch: [0/200], iter: [6/19], global_step:7, train loss: 1.833748, lr: 0.000007, avg_reader_cost: 0.79442 sec, avg_batch_cost: 0.89132 sec, avg_samples: 8.00000, ips: 8.97544 images/sec\n",
      "[2021/12/26 20:12:29] root INFO: epoch: [0/200], iter: [7/19], global_step:8, train loss: 1.747398, lr: 0.000008, avg_reader_cost: 0.74634 sec, avg_batch_cost: 0.84421 sec, avg_samples: 8.00000, ips: 9.47633 images/sec\n",
      "[2021/12/26 20:12:30] root INFO: epoch: [0/200], iter: [8/19], global_step:9, train loss: 1.603032, lr: 0.000009, avg_reader_cost: 0.79887 sec, avg_batch_cost: 0.89827 sec, avg_samples: 8.00000, ips: 8.90600 images/sec\n",
      "[2021/12/26 20:12:31] root INFO: epoch: [0/200], iter: [9/19], global_step:10, train loss: 1.678029, lr: 0.000010, avg_reader_cost: 0.78243 sec, avg_batch_cost: 0.88950 sec, avg_samples: 8.00000, ips: 8.99385 images/sec\n",
      "[2021/12/26 20:12:33] root INFO: [Eval]process: 0/7, loss: 1.41839\n",
      "[2021/12/26 20:12:34] root INFO: [Eval]process: 1/7, loss: 1.60403\n",
      "[2021/12/26 20:12:35] root INFO: [Eval]process: 2/7, loss: 1.70345\n",
      "[2021/12/26 20:12:36] root INFO: [Eval]process: 3/7, loss: 1.60751\n",
      "[2021/12/26 20:12:38] root INFO: [Eval]process: 4/7, loss: 1.49639\n",
      "Corrupt JPEG data: premature end of data segment\n",
      "[2021/12/26 20:12:39] root INFO: [Eval]process: 5/7, loss: 1.66062\n",
      "[2021/12/26 20:12:39] root INFO: [Eval]process: 6/7, loss: 1.56035\n",
      "[2021/12/26 20:12:40] root INFO: \n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "      ANSWER       0.01      0.01      0.01      1514\n",
      "      HEADER       0.00      0.00      0.00        58\n",
      "    QUESTION       0.03      0.02      0.02      1155\n",
      "\n",
      "   micro avg       0.02      0.01      0.01      2727\n",
      "   macro avg       0.01      0.01      0.01      2727\n",
      "weighted avg       0.02      0.01      0.01      2727\n",
      "\n",
      "[2021/12/26 20:12:40] root INFO: ***** Eval results  *****\n",
      "[2021/12/26 20:12:40] root INFO:   f1 = 0.013078227173649792\n",
      "[2021/12/26 20:12:40] root INFO:   loss = 1.5786780970437186\n",
      "[2021/12/26 20:12:40] root INFO:   precision = 0.01925820256776034\n",
      "[2021/12/26 20:12:40] root INFO:   recall = 0.009900990099009901\n",
      "[2021/12/26 20:12:44] root INFO: Saving model checkpoint to ./output/ser/best_model\n",
      "[2021/12/26 20:12:44] root INFO: [epoch 0/200][iter: 9/19] results: {'loss': 1.5786780970437186, 'precision': 0.01925820256776034, 'recall': 0.009900990099009901, 'f1': 0.013078227173649792}\n",
      "[2021/12/26 20:12:44] root INFO: best metrics: {'loss': 1.5786780970437186, 'precision': 0.01925820256776034, 'recall': 0.009900990099009901, 'f1': 0.013078227173649792}\n",
      "^C\n",
      "Traceback (most recent call last):\n",
      "  File \"train_ser.py\", line 248, in <module>\n",
      "    train(args)\n",
      "  File \"train_ser.py\", line 178, in train\n",
      "    loss = loss_class(labels, outputs, batch['attention_mask'])\n",
      "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py\", line 914, in __call__\n",
      "    outputs = self.forward(*inputs, **kwargs)\n",
      "  File \"/home/aistudio/PaddleOCR/ppstructure/vqa/losses.py\", line 29, in forward\n",
      "    [-1, self.num_classes])[active_loss]\n",
      "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/varbase_patch_methods.py\", line 594, in __getitem__\n",
      "    return _getitem_impl_(self, item)\n",
      "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/variable_index.py\", line 403, in _getitem_impl_\n",
      "    bool_2_idx = where(slice_item == True)\n",
      "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/layers/nn.py\", line 14242, in where\n",
      "    return _C_ops.where_index(condition)\n",
      "KeyboardInterrupt\n"
     ]
    }
   ],
   "source": [
    "! python train_ser.py \\\n",
    "    --model_name_or_path \"layoutxlm-base-uncased\" \\\n",
    "    --ser_model_type \"LayoutXLM\" \\\n",
    "    --train_data_dir \"XFUND/zh_train/image\" \\\n",
    "    --train_label_path \"XFUND/zh_train/xfun_normalize_train.json\" \\\n",
    "    --eval_data_dir \"XFUND/zh_val/image\" \\\n",
    "    --eval_label_path \"XFUND/zh_val/xfun_normalize_val.json\" \\\n",
    "    --per_gpu_train_batch_size 8 \\\n",
    "    --per_gpu_eval_batch_size 8 \\\n",
    "    --num_train_epochs 200 \\\n",
    "    --eval_steps 10 \\\n",
    "    --output_dir \"./output/ser/\" \\\n",
    "    --learning_rate 5e-5 \\\n",
    "    --warmup_steps 50 \\\n",
    "    --evaluate_during_training \\\n",
    "    --num_workers 0 \\\n",
    "    --seed 2048"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.4 Model Evaluation\n",
    "\n",
    "During the training process, two models are saved by default, one is the latest trained model named latest, and the other is the most accurate model named best. The folder structure for saving the model is as follows\n",
    "```bash\n",
    "output/ser/\n",
    "├── best_model\n",
    "│ ├── model_config.json # Model configuration\n",
    "│ ├── model_state.pdparams # Model parameters\n",
    "│ ├── sentencepiece.bpe.model # Parameters of tokenizer\n",
    "│ ├── tokenizer_config.json # tokenizer configuration\n",
    "│ └── training_args.bin # Parameters when starting training\n",
    "├── infer_results.txt\n",
    "├── latest_model\n",
    "│   ├── model_config.json\n",
    "│   ├── model_state.pdparams\n",
    "│   ├── sentencepiece.bpe.model\n",
    "│   ├── tokenizer_config.json\n",
    "│   └── training_args.bin\n",
    "├── test_gt.txt\n",
    "├── test_pred.txt\n",
    "└── train.log  # Training log\n",
    "```\n",
    "\n",
    "Next, use the saved model parameters to evaluate the accuracy on the test set:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2021/12/26 20:13:05] root INFO: -----------  Configuration Arguments -----------\n",
      "[2021/12/26 20:13:05] root INFO: adam_epsilon: 1e-08\n",
      "[2021/12/26 20:13:05] root INFO: det_model_dir: None\n",
      "[2021/12/26 20:13:05] root INFO: eval_data_dir: XFUND/zh_val/image\n",
      "[2021/12/26 20:13:05] root INFO: eval_label_path: XFUND/zh_val/xfun_normalize_val.json\n",
      "[2021/12/26 20:13:05] root INFO: eval_steps: 10\n",
      "[2021/12/26 20:13:05] root INFO: evaluate_during_training: False\n",
      "[2021/12/26 20:13:05] root INFO: infer_imgs: None\n",
      "[2021/12/26 20:13:05] root INFO: label_map_path: ./labels/labels_ser.txt\n",
      "[2021/12/26 20:13:05] root INFO: learning_rate: 5e-05\n",
      "[2021/12/26 20:13:05] root INFO: max_grad_norm: 1.0\n",
      "[2021/12/26 20:13:05] root INFO: max_seq_length: 512\n",
      "[2021/12/26 20:13:05] root INFO: model_name_or_path: output/ser/best_model\n",
      "[2021/12/26 20:13:05] root INFO: num_train_epochs: 3\n",
      "[2021/12/26 20:13:05] root INFO: num_workers: 8\n",
      "[2021/12/26 20:13:05] root INFO: ocr_json_path: None\n",
      "[2021/12/26 20:13:05] root INFO: output_dir: output/ser/\n",
      "[2021/12/26 20:13:05] root INFO: per_gpu_eval_batch_size: 8\n",
      "[2021/12/26 20:13:05] root INFO: per_gpu_train_batch_size: 8\n",
      "[2021/12/26 20:13:05] root INFO: re_model_name_or_path: None\n",
      "[2021/12/26 20:13:05] root INFO: rec_model_dir: None\n",
      "[2021/12/26 20:13:05] root INFO: resume: False\n",
      "[2021/12/26 20:13:05] root INFO: seed: 2048\n",
      "[2021/12/26 20:13:05] root INFO: ser_model_type: LayoutXLM\n",
      "[2021/12/26 20:13:05] root INFO: train_data_dir: None\n",
      "[2021/12/26 20:13:05] root INFO: train_label_path: None\n",
      "[2021/12/26 20:13:05] root INFO: warmup_steps: 0\n",
      "[2021/12/26 20:13:05] root INFO: weight_decay: 0.0\n",
      "[2021/12/26 20:13:05] root INFO: ------------------------------------------------\n",
      "W1226 20:13:05.816488  1230 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1\n",
      "W1226 20:13:05.820412  1230 device_context.cc:465] device: 0, cuDNN Version: 7.6.\n",
      "Corrupt JPEG data: premature end of data segment\n",
      "[2021/12/26 20:13:18] root INFO: [Eval]process: 0/7, loss: 1.41839\n",
      "[2021/12/26 20:13:18] root INFO: [Eval]process: 1/7, loss: 1.60403\n",
      "[2021/12/26 20:13:19] root INFO: [Eval]process: 2/7, loss: 1.70345\n",
      "[2021/12/26 20:13:19] root INFO: [Eval]process: 3/7, loss: 1.60751\n",
      "[2021/12/26 20:13:19] root INFO: [Eval]process: 4/7, loss: 1.49639\n",
      "[2021/12/26 20:13:19] root INFO: [Eval]process: 5/7, loss: 1.66062\n",
      "[2021/12/26 20:13:19] root INFO: [Eval]process: 6/7, loss: 1.56035\n",
      "[2021/12/26 20:13:20] root INFO: \n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "      ANSWER       0.01      0.01      0.01      1514\n",
      "      HEADER       0.00      0.00      0.00        58\n",
      "    QUESTION       0.03      0.02      0.02      1155\n",
      "\n",
      "   micro avg       0.02      0.01      0.01      2727\n",
      "   macro avg       0.01      0.01      0.01      2727\n",
      "weighted avg       0.02      0.01      0.01      2727\n",
      "\n",
      "[2021/12/26 20:13:20] root INFO: ***** Eval results  *****\n",
      "[2021/12/26 20:13:20] root INFO:   f1 = 0.013078227173649792\n",
      "[2021/12/26 20:13:20] root INFO:   loss = 1.5786780970437186\n",
      "[2021/12/26 20:13:20] root INFO:   precision = 0.01925820256776034\n",
      "[2021/12/26 20:13:20] root INFO:   recall = 0.009900990099009901\n",
      "[2021/12/26 20:13:20] root INFO: {'loss': 1.5786780970437186, 'precision': 0.01925820256776034, 'recall': 0.009900990099009901, 'f1': 0.013078227173649792}\n"
     ]
    }
   ],
   "source": [
    "! python eval_ser.py \\\n",
    "    --model_name_or_path \"output/ser/best_model\" \\\n",
    "    --ser_model_type \"LayoutXLM\" \\\n",
    "    --eval_data_dir \"XFUND/zh_val/image\" \\\n",
    "    --eval_label_path \"XFUND/zh_val/xfun_normalize_val.json\" \\\n",
    "    --per_gpu_eval_batch_size 8 \\\n",
    "    --num_workers 8 \\\n",
    "    --output_dir \"output/ser/\"  \\\n",
    "    --seed 2048"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.5 Model Prediction\n",
    "\n",
    "After training the model, you can also use the saved model to perform model inference on a single picture or an image in a folder, and observe the prediction effect of the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "W1226 20:07:23.831934   640 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1\n",
      "W1226 20:07:23.835953   640 device_context.cc:465] device: 0, cuDNN Version: 7.6.\n",
      "[2021/12/26 20:07:33] root WARNING: version PP-OCRv2 not support cls models, auto switch to version PP-OCR\n",
      "Namespace(benchmark=False, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, det=True, det_algorithm='DB', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/det/ch/ch_PP-OCRv2_det_infer', det_pse_box_thresh=0.85, det_pse_box_type='box', det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_polygon=False, det_sast_score_thresh=0.5, drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='fast', e2e_pgnet_polygon=True, e2e_pgnet_score_thresh=0.5, e2e_pgnet_valid_set='totaltext', enable_mkldnn=False, gpu_mem=500, help='==SUPPRESS==', image_dir=None, ir_optim=True, label_list=['0', '180'], lang='ch', layout_path_model='lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config', max_batch_size=10, max_text_length=25, min_subgraph_size=15, ocr_version='PP-OCRv2', output='./output/table', precision='fp32', process_id=0, rec=True, rec_algorithm='CRNN', rec_batch_num=6, rec_char_dict_path='/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleocr/ppocr/utils/ppocr_keys_v1.txt', rec_image_shape='3, 32, 320', rec_model_dir='/home/aistudio/.paddleocr/2.3.0.2/ocr/rec/ch/ch_PP-OCRv2_rec_infer', save_log_path='./log_output/', show_log=False, structure_version='STRUCTURE', table_char_dict_path=None, table_char_type='en', table_max_len=488, table_model_dir=None, total_process_num=1, type='ocr', use_angle_cls=False, use_dilation=False, use_gpu=True, use_mp=False, use_onnx=False, use_pdserving=False, use_space_char=True, use_tensorrt=False, vis_font_path='./doc/fonts/simfang.ttf', warmup=True)\n",
      "process: [0/1], save result to output/ser_e2e/zh_val_42_ser.jpg\n",
      "Corrupt JPEG data: premature end of data segment\n"
     ]
    }
   ],
   "source": [
    "! python infer_ser_e2e.py \\\n",
    "    --model_name_or_path \"./inference/PP-Layout_v1.0_ser_pretrained/\" \\\n",
    "    --ser_model_type \"LayoutXLM\" \\\n",
    "    --max_seq_length 512 \\\n",
    "    --output_dir \"output/ser_e2e/\" \\\n",
    "    --infer_imgs \"images/input/zh_val_42.jpg\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4 Assignment\n",
    "\n",
    "Experimental questions\n",
    "\n",
    "[https://aistudio.baidu.com/aistudio/projectdetail/3281385](https://aistudio.baidu.com/aistudio/projectdetail/3281385)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "py35-paddle1.2.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}