# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import cv2
import numpy as np
import paddle
import copy
from paddle.io import Dataset

__all__ = ["XFUNDataset"]


class XFUNDataset(Dataset):
    """
    Example:
        print("=====begin to build dataset=====")
        from paddlenlp.transformers import LayoutXLMTokenizer
        tokenizer = LayoutXLMTokenizer.from_pretrained("/paddle/models/transformers/layoutxlm-base-paddle/")
        tok_res = tokenizer.tokenize("Maribyrnong")
        # res = tokenizer.convert_ids_to_tokens(val_data["input_ids"][0])
        dataset = XFUNDataset(
            tokenizer,
            data_dir="./zh.val/",
            label_path="zh.val/xfun_normalize_val.json",
            img_size=(224, 224))
        print(len(dataset))

        data = dataset[0]
        print(data.keys())
        print("input_ids: ", data["input_ids"])
        print("labels: ", data["labels"])
        print("token_type_ids: ", data["token_type_ids"])
        # "words_list" is not included in the returned dict by default
        print("image shape: ", data["image"].shape)
    """

    def __init__(self,
                 tokenizer,
                 data_dir,
                 label_path,
                 contains_re=False,
                 label2id_map=None,
                 img_size=(224, 224),
                 pad_token_label_id=None,
                 add_special_ids=False,
                 return_attention_mask=True,
                 load_mode='all',
                 max_seq_len=512):
        super().__init__()
        self.tokenizer = tokenizer
        self.data_dir = data_dir
        self.label_path = label_path
        self.contains_re = contains_re
        self.label2id_map = label2id_map
        self.img_size = img_size
        self.pad_token_label_id = pad_token_label_id
        self.add_special_ids = add_special_ids
        self.return_attention_mask = return_attention_mask
        self.load_mode = load_mode
        self.max_seq_len = max_seq_len

        if self.pad_token_label_id is None:
            self.pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

        self.all_lines = self.read_all_lines()

        self.entities_labels = {'HEADER': 0, 'QUESTION': 1, 'ANSWER': 2}
        self.return_keys = {
            'bbox': {'type': 'np', 'dtype': 'int64'},
            'input_ids': {'type': 'np', 'dtype': 'int64'},
            'labels': {'type': 'np', 'dtype': 'int64'},
            'attention_mask': {'type': 'np', 'dtype': 'int64'},
            'image': {'type': 'np', 'dtype': 'float32'},
            'token_type_ids': {'type': 'np', 'dtype': 'int64'},
            'entities': {'type': 'dict'},
            'relations': {'type': 'dict'},
        }

        if load_mode == "all":
            self.encoded_inputs_all = self._parse_label_file_all()
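
    # Pad "input_ids", "token_type_ids", "labels" and "bbox" (and build
    # "attention_mask") out to max_seq_len on the side configured by
    # tokenizer.padding_side. Sequences already at or above max_seq_len are
    # left unchanged here and are cut down later by truncate_inputs().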
    def pad_sentences(self,
                      encoded_inputs,
                      max_seq_len=512,
                      pad_to_max_seq_len=True,
                      return_attention_mask=True,
                      return_token_type_ids=True,
                      truncation_strategy="longest_first",
                      return_overflowing_tokens=False,
                      return_special_tokens_mask=False):
        # Padding
        needs_to_be_padded = pad_to_max_seq_len and \
            max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len

        if needs_to_be_padded:
            difference = max_seq_len - len(encoded_inputs["input_ids"])
            if self.tokenizer.padding_side == 'right':
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = (
                        [1] * len(encoded_inputs["input_ids"]) +
                        [0] * difference)
                if return_token_type_ids:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"] +
                        [self.tokenizer.pad_token_type_id] * difference)
                if return_special_tokens_mask:
                    encoded_inputs["special_tokens_mask"] = (
                        encoded_inputs["special_tokens_mask"] +
                        [1] * difference)
                encoded_inputs["input_ids"] = (
                    encoded_inputs["input_ids"] +
                    [self.tokenizer.pad_token_id] * difference)
                encoded_inputs["labels"] = (
                    encoded_inputs["labels"] +
                    [self.pad_token_label_id] * difference)
                encoded_inputs["bbox"] = (
                    encoded_inputs["bbox"] + [[0, 0, 0, 0]] * difference)
            elif self.tokenizer.padding_side == 'left':
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = (
                        [0] * difference +
                        [1] * len(encoded_inputs["input_ids"]))
                if return_token_type_ids:
                    encoded_inputs["token_type_ids"] = (
                        [self.tokenizer.pad_token_type_id] * difference +
                        encoded_inputs["token_type_ids"])
                if return_special_tokens_mask:
                    encoded_inputs["special_tokens_mask"] = (
                        [1] * difference +
                        encoded_inputs["special_tokens_mask"])
                encoded_inputs["input_ids"] = (
                    [self.tokenizer.pad_token_id] * difference +
                    encoded_inputs["input_ids"])
                encoded_inputs["labels"] = (
                    [self.pad_token_label_id] * difference +
                    encoded_inputs["labels"])
                encoded_inputs["bbox"] = (
                    [[0, 0, 0, 0]] * difference + encoded_inputs["bbox"])
        else:
            if return_attention_mask:
                encoded_inputs["attention_mask"] = [1] * len(
                    encoded_inputs["input_ids"])

        return encoded_inputs
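
    # Truncate every per-token field to max_seq_len; "sample_id" (if present)
    # is skipped because it is a scalar identifier, not a sequence.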
    def truncate_inputs(self, encoded_inputs, max_seq_len=512):
        for key in encoded_inputs:
            if key == "sample_id":
                continue
            length = min(len(encoded_inputs[key]), max_seq_len)
            encoded_inputs[key] = encoded_inputs[key][:length]
        return encoded_inputs
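
    # Read the whole label file once; each line describes one annotated image.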
    def read_all_lines(self):
        with open(self.label_path, "r", encoding='utf-8') as fin:
            lines = fin.readlines()
        return lines

    def _parse_label_file_all(self):
        """
        parse all samples
        """
        encoded_inputs_all = []
        for line in self.all_lines:
            encoded_inputs_all.extend(self._parse_label_file(line))
        return encoded_inputs_all
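
    # A label line has the form "<image file name>\t<json annotation>"; the
    # json carries the image size and the per-region OCR results ("ocr_info").
    # One line can expand into several chunked samples, each tagged with the
    # path of its source image.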
    def _parse_label_file(self, line):
        """
        parse single sample
        """
        image_name, info_str = line.split("\t")
        image_path = os.path.join(self.data_dir, image_name)

        def add_image_path(x):
            x['image_path'] = image_path
            return x

        encoded_inputs = self._read_encoded_inputs_sample(info_str)
        if self.contains_re:
            encoded_inputs = self._chunk_re(encoded_inputs)
        else:
            encoded_inputs = self._chunk_ser(encoded_inputs)
        encoded_inputs = list(map(add_image_path, encoded_inputs))
        return encoded_inputs
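
    # Tokenize every OCR region, build token-level SER labels (B-/I- tags for
    # non-"other" regions), and rescale each bbox to a 0-1000 coordinate range
    # (the convention used by LayoutLM/LayoutXLM-style models). When
    # contains_re is True, entity spans and linking relations are collected
    # alongside the token-level fields.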
    def _read_encoded_inputs_sample(self, info_str):
        """
        parse label info
        """
        # read text info
        info_dict = json.loads(info_str)
        height = info_dict["height"]
        width = info_dict["width"]

        words_list = []
        bbox_list = []
        input_ids_list = []
        token_type_ids_list = []
        gt_label_list = []

        if self.contains_re:
            # for re
            entities = []
            relations = []
            id2label = {}
            entity_id_to_index_map = {}
            empty_entity = set()
        for info in info_dict["ocr_info"]:
            if self.contains_re:
                # for re
                if len(info["text"]) == 0:
                    empty_entity.add(info["id"])
                    continue
                id2label[info["id"]] = info["label"]
                relations.extend([tuple(sorted(l)) for l in info["linking"]])

            # x1, y1, x2, y2
            bbox = info["bbox"]
            label = info["label"]
            bbox[0] = int(bbox[0] * 1000.0 / width)
            bbox[2] = int(bbox[2] * 1000.0 / width)
            bbox[1] = int(bbox[1] * 1000.0 / height)
            bbox[3] = int(bbox[3] * 1000.0 / height)

            text = info["text"]
            encode_res = self.tokenizer.encode(
                text, pad_to_max_seq_len=False, return_attention_mask=True)

            gt_label = []
            if not self.add_special_ids:
                # TODO: use tok.all_special_ids to remove
                encode_res["input_ids"] = encode_res["input_ids"][1:-1]
                encode_res["token_type_ids"] = encode_res["token_type_ids"][1:-1]
                encode_res["attention_mask"] = encode_res["attention_mask"][1:-1]
            if label.lower() == "other":
                gt_label.extend([0] * len(encode_res["input_ids"]))
            else:
                gt_label.append(self.label2id_map[("b-" + label).upper()])
                gt_label.extend([self.label2id_map[("i-" + label).upper()]] *
                                (len(encode_res["input_ids"]) - 1))
            if self.contains_re:
                if gt_label[0] != self.label2id_map["O"]:
                    entity_id_to_index_map[info["id"]] = len(entities)
                    entities.append({
                        "start": len(input_ids_list),
                        "end": len(input_ids_list) + len(encode_res["input_ids"]),
                        "label": label.upper(),
                    })
            input_ids_list.extend(encode_res["input_ids"])
            token_type_ids_list.extend(encode_res["token_type_ids"])
            bbox_list.extend([bbox] * len(encode_res["input_ids"]))
            gt_label_list.extend(gt_label)
            words_list.append(text)

        encoded_inputs = {
            "input_ids": input_ids_list,
            "labels": gt_label_list,
            "token_type_ids": token_type_ids_list,
            "bbox": bbox_list,
            "attention_mask": [1] * len(input_ids_list),
            # "words_list": words_list,
        }
        encoded_inputs = self.pad_sentences(
            encoded_inputs,
            max_seq_len=self.max_seq_len,
            return_attention_mask=self.return_attention_mask)
        encoded_inputs = self.truncate_inputs(encoded_inputs)

        if self.contains_re:
            relations = self._relations(entities, relations, id2label,
                                        empty_entity, entity_id_to_index_map)
            encoded_inputs['relations'] = relations
            encoded_inputs['entities'] = entities
        return encoded_inputs
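
    # Split one long SER sample into fixed-size 512-token chunks so that each
    # chunk fits the model's maximum sequence length.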
    def _chunk_ser(self, encoded_inputs):
        encoded_inputs_all = []
        seq_len = len(encoded_inputs['input_ids'])
        chunk_size = 512
        for chunk_id, index in enumerate(range(0, seq_len, chunk_size)):
            chunk_beg = index
            chunk_end = min(index + chunk_size, seq_len)
            encoded_inputs_example = {}
            for key in encoded_inputs:
                encoded_inputs_example[key] = encoded_inputs[key][
                    chunk_beg:chunk_end]
            encoded_inputs_all.append(encoded_inputs_example)
        return encoded_inputs_all
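
    # Chunk a RE sample into 512-token windows and remap entity and relation
    # indices from global (whole-document) positions to chunk-local ones;
    # entities and relations that fall outside the window are dropped.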
    def _chunk_re(self, encoded_inputs):
        # prepare data
        entities = encoded_inputs.pop('entities')
        relations = encoded_inputs.pop('relations')
        encoded_inputs_all = []
        chunk_size = 512
        for chunk_id, index in enumerate(
                range(0, len(encoded_inputs["input_ids"]), chunk_size)):
            item = {}
            for k in encoded_inputs:
                item[k] = encoded_inputs[k][index:index + chunk_size]

            # select entity in current chunk
            entities_in_this_span = []
            global_to_local_map = {}
            for entity_id, entity in enumerate(entities):
                if (index <= entity["start"] < index + chunk_size and
                        index <= entity["end"] < index + chunk_size):
                    entity["start"] = entity["start"] - index
                    entity["end"] = entity["end"] - index
                    global_to_local_map[entity_id] = len(entities_in_this_span)
                    entities_in_this_span.append(entity)

            # select relations in current chunk
            relations_in_this_span = []
            for relation in relations:
                if (index <= relation["start_index"] < index + chunk_size and
                        index <= relation["end_index"] < index + chunk_size):
                    relations_in_this_span.append({
                        "head": global_to_local_map[relation["head"]],
                        "tail": global_to_local_map[relation["tail"]],
                        "start_index": relation["start_index"] - index,
                        "end_index": relation["end_index"] - index,
                    })
            item.update({
                "entities": reformat(entities_in_this_span),
                "relations": reformat(relations_in_this_span),
            })
            item['entities']['label'] = [
                self.entities_labels[x] for x in item['entities']['label']
            ]
            encoded_inputs_all.append(item)
        return encoded_inputs_all
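
    # Deduplicate the raw linking pairs and keep only question->answer links,
    # rewriting each as a head/tail pair of entity indices plus the token span
    # it covers.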
    def _relations(self, entities, relations, id2label, empty_entity,
                   entity_id_to_index_map):
        """
        build relations
        """
        relations = list(set(relations))
        relations = [
            rel for rel in relations
            if rel[0] not in empty_entity and rel[1] not in empty_entity
        ]
        kv_relations = []
        for rel in relations:
            pair = [id2label[rel[0]], id2label[rel[1]]]
            if pair == ["question", "answer"]:
                kv_relations.append({
                    "head": entity_id_to_index_map[rel[0]],
                    "tail": entity_id_to_index_map[rel[1]]
                })
            elif pair == ["answer", "question"]:
                kv_relations.append({
                    "head": entity_id_to_index_map[rel[1]],
                    "tail": entity_id_to_index_map[rel[0]]
                })
            else:
                continue
        relations = sorted(
            [{
                "head": rel["head"],
                "tail": rel["tail"],
                "start_index": get_relation_span(rel, entities)[0],
                "end_index": get_relation_span(rel, entities)[1],
            } for rel in kv_relations],
            key=lambda x: x["head"])
        return relations
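
    # Load an image, convert BGR->RGB, resize to img_size, normalize with the
    # ImageNet mean/std, and transpose to CHW layout for the model.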
    def load_img(self, image_path):
        # read img
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        resize_h, resize_w = self.img_size
        im_shape = img.shape[0:2]
        im_scale_y = resize_h / im_shape[0]
        im_scale_x = resize_w / im_shape[1]
        img_new = cv2.resize(
            img,
            None,
            None,
            fx=im_scale_x,
            fy=im_scale_y,
            interpolation=cv2.INTER_CUBIC)
        mean = np.array([0.485, 0.456, 0.406])[np.newaxis, np.newaxis, :]
        std = np.array([0.229, 0.224, 0.225])[np.newaxis, np.newaxis, :]
        img_new = img_new / 255.0
        img_new -= mean
        img_new /= std
        img = img_new.transpose((2, 0, 1))
        return img
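
    # Return one sample: with load_mode == "all" the pre-parsed cache is used
    # (deep-copied so callers cannot mutate it); otherwise the label line is
    # parsed on the fly and only its first chunk is used. The image is loaded
    # here and every field listed in return_keys is converted to a numpy
    # array of the configured dtype.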
    def __getitem__(self, idx):
        if self.load_mode == "all":
            data = copy.deepcopy(self.encoded_inputs_all[idx])
        else:
            data = self._parse_label_file(self.all_lines[idx])[0]

        image_path = data.pop('image_path')
        data["image"] = self.load_img(image_path)

        return_data = {}
        for k, v in data.items():
            if k in self.return_keys:
                if self.return_keys[k]['type'] == 'np':
                    v = np.array(v, dtype=self.return_keys[k]['dtype'])
                return_data[k] = v
        return return_data

    def __len__(self):
        if self.load_mode == "all":
            return len(self.encoded_inputs_all)
        else:
            return len(self.all_lines)
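

# Token span covered by a relation: the minimum start and maximum end over
# its head and tail entities.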
def get_relation_span(rel, entities):
    bound = []
    for entity_index in [rel["head"], rel["tail"]]:
        bound.append(entities[entity_index]["start"])
        bound.append(entities[entity_index]["end"])
    return min(bound), max(bound)
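

# Turn a list of dicts into a dict of lists (one list per key), which is the
# layout _chunk_re expects for the "entities" and "relations" fields.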
def reformat(data):
    new_data = {}
    for item in data:
        for k, v in item.items():
            if k not in new_data:
                new_data[k] = []
            new_data[k].append(v)
    return new_data