modified pr

2025-06-03 21:53:39 +08:00 · 2022-07-10 09:20:59 +08:00 · 2022-07-10 09:20:59 +08:00 · 1cda437c4d
commit 1cda437c4d
parent bbca1e0d66
7 changed files with 13 additions and 119 deletions
--- a/.gitignore
+++ b/.gitignore
@ -11,6 +11,7 @@ inference/
 inference_results/
 output/
 train_data/
 log/
 *.DS_Store
 *.vs
 *.user
--- a/configs/rec/rec_r32_gaspin_bilstm_att.yml
+++ b/configs/rec/rec_r32_gaspin_bilstm_att.yml
@ -61,7 +61,6 @@ Loss:
 PostProcess:
  name: SPINAttnLabelDecode
  character_dict_path: ./ppocr/utils/dict/spin_dict.txt
  use_space_char: False
--- a/ppocr/losses/rec_spin_att_loss.py
+++ b/ppocr/losses/rec_spin_att_loss.py
@ -1,4 +1,4 @@
-# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@ -19,6 +20,9 @@ from __future__ import print_function
 import paddle
 from paddle import nn
 '''This code is adapted from:
 https://github.com/hikopensource/DAVAR-Lab-OCR
 '''
 class SPINAttentionLoss(nn.Layer):
    def __init__(self, reduction='mean', ignore_index=-100, **kwargs):
--- a/ppocr/modeling/heads/rec_spin_att_head.py
+++ b/ppocr/modeling/heads/rec_spin_att_head.py
@ -1,4 +1,4 @@
-# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -80,98 +80,6 @@ class SPINAttentionHead(nn.Layer):
        return probs
 class AttentionGRUCell(nn.Layer):
    """Single attention step followed by one ``nn.GRUCell`` update.

    The cell scores every position of ``batch_H`` against the previous
    hidden state, turns the scores into attention weights, builds a
    context vector from them and feeds ``[context, char_onehots]`` to the
    recurrent cell.

    Args:
        input_size: feature size of ``batch_H`` (input width of ``i2h``).
        hidden_size: width of the hidden state and of the projections.
        num_embeddings: width of ``char_onehots``; it is added to
            ``input_size`` to form the recurrent cell's input size.
        use_gru: accepted for interface compatibility; not read anywhere
            in this class.
    """

    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super().__init__()
        # Projection of the encoder features (no bias).
        self.i2h = nn.Linear(
            in_features=input_size, out_features=hidden_size, bias_attr=False)
        # Projection of the previous hidden state.
        self.h2h = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        # Collapses each projected position to one attention score.
        self.score = nn.Linear(
            in_features=hidden_size, out_features=1, bias_attr=False)
        # The recurrent cell consumes the context vector joined with the
        # one-hot character vector.
        rnn_input_size = input_size + num_embeddings
        self.rnn = nn.GRUCell(
            input_size=rnn_input_size, hidden_size=hidden_size)
        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        """Run one attention + recurrent step.

        Args:
            prev_hidden: previous hidden state passed to ``h2h`` and ``rnn``.
            batch_H: encoder features; presumably [batch, T, input_size]
                -- TODO confirm against the caller.
            char_onehots: one-hot vector of the previous character,
                concatenated to the context along axis 1.

        Returns:
            tuple: ``(cur_hidden, alpha)`` where ``cur_hidden`` is whatever
            ``self.rnn`` returns and ``alpha`` holds the attention weights
            after the ``[0, 2, 1]`` transpose.
        """
        feats_proj = self.i2h(batch_H)
        # Add an axis at position 1 so the state projection can be added
        # to the per-position feature projection.
        state_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1)
        energy = self.score(paddle.tanh(paddle.add(feats_proj, state_proj)))

        # Normalise over axis 1, then swap the last two axes so the
        # weights can be matrix-multiplied with batch_H.
        alpha = paddle.transpose(F.softmax(energy, axis=1), [0, 2, 1])
        context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)

        rnn_input = paddle.concat([context, char_onehots], 1)
        cur_hidden = self.rnn(rnn_input, prev_hidden)
        return cur_hidden, alpha
 class AttentionLSTM(nn.Layer):
    """Attention decoder built on ``AttentionLSTMCell``.

    Args:
        in_channels: feature size of the encoder output fed to the cell.
        out_channels: number of character classes; also the one-hot width.
        hidden_size: width of the recurrent state.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
        super().__init__()
        self.input_size = in_channels
        self.hidden_size = hidden_size
        self.num_classes = out_channels

        self.attention_cell = AttentionLSTMCell(
            input_size=in_channels,
            hidden_size=hidden_size,
            num_embeddings=out_channels,
            use_gru=False)
        # Maps a hidden state to per-class outputs.
        self.generator = nn.Linear(
            in_features=hidden_size, out_features=out_channels)

    def _char_to_onehot(self, input_char, onehot_dim):
        """Return the one-hot encoding of ``input_char`` with ``onehot_dim`` classes."""
        return F.one_hot(input_char, num_classes=onehot_dim)

    def forward(self, inputs, targets=None, batch_max_length=25):
        """Decode ``batch_max_length`` steps from ``inputs``.

        Args:
            inputs: encoder features; ``inputs.shape[0]`` is used as the
                batch size.
            targets: ground-truth character indices. When given, column
                ``i`` is fed at step ``i``; when ``None``, each step is fed
                the argmax of the previous step, starting from index 0.
            batch_max_length: number of decoding steps.

        Returns:
            Generator outputs for all steps stacked along axis 1. Without
            ``targets`` and with zero steps the result is ``None``.
        """
        batch_size = inputs.shape[0]
        state_shape = (batch_size, self.hidden_size)
        # Two separate zero tensors form the initial state pair.
        state = (paddle.zeros(state_shape), paddle.zeros(state_shape))

        if targets is None:
            # No labels: feed back the most likely class of the last step.
            prev_chars = paddle.zeros(shape=[batch_size], dtype="int32")
            probs = None
            for _ in range(batch_max_length):
                onehot = self._char_to_onehot(
                    prev_chars, onehot_dim=self.num_classes)
                cell_out, _ = self.attention_cell(state, inputs, onehot)
                step_out = self.generator(cell_out[0])
                # The second element of the cell result carries the state pair.
                state = (cell_out[1][0], cell_out[1][1])

                step_slice = paddle.unsqueeze(step_out, axis=1)
                if probs is None:
                    probs = step_slice
                else:
                    probs = paddle.concat([probs, step_slice], axis=1)
                prev_chars = step_out.argmax(axis=1)
            return probs

        # Labels available: feed the i-th ground-truth character at step i.
        step_states = []
        for step in range(batch_max_length):
            onehot = self._char_to_onehot(
                targets[:, step], onehot_dim=self.num_classes)
            cell_out, _ = self.attention_cell(state, inputs, onehot)
            state = (cell_out[1][0], cell_out[1][1])
            step_states.append(paddle.unsqueeze(state[0], axis=1))

        stacked = paddle.concat(step_states, axis=1)
        return self.generator(stacked)
 class AttentionLSTMCell(nn.Layer):
    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super(AttentionLSTMCell, self).__init__()
--- a/ppocr/modeling/necks/rnn.py
+++ b/ppocr/modeling/necks/rnn.py
@ -70,17 +70,6 @@ class BidirectionalLSTM(nn.Layer):
            self.linear = nn.Linear(hidden_size * 2, output_size)
    def forward(self, input_feature):
        """
        Args:
            input_feature (Tensor): visual feature [batch_size x T x input_size]
        Returns:
            Tensor: LSTM output contextual feature [batch_size x T x output_size]
        """
        # self.rnn.flatten_parameters() # error in export_model
        recurrent, _ = self.rnn(input_feature)  # batch_size x T x input_size -> batch_size x T x (2*hidden_size)
        if self.with_linear:
            output = self.linear(recurrent)     # batch_size x T x output_size
--- a/ppocr/modeling/transforms/gaspin_transformer.py
+++ b/ppocr/modeling/transforms/gaspin_transformer.py
@ -1,4 +1,4 @@
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -71,14 +71,14 @@ class SP_TransformerNetwork(nn.Layer):
        """
        Args:
-            batch_I (torch.Tensor): batch of input images [batch_size x nc x I_height x I_width]
+            batch_I (Tensor): batch of input images [batch_size x nc x I_height x I_width]
            weights:
            offsets: the predicted offset by AIN, a scalar
            lambda_color: the learnable update gate \alpha in Equa. (5) as
                          g(x) = (1 - \alpha) \odot x + \alpha \odot x_{offsets}
        Returns:
-            torch.Tensor: transformed images by SPN as Equa. (4) in Ref. [1]
+            Tensor: transformed images by SPN as Equa. (4) in Ref. [1]
                        [batch_size x I_channel_num x I_r_height x I_r_width]
        """
@ -114,8 +114,6 @@ class GA_SPIN_Transformer(nn.Layer):
            in_channels (int): channel of input features,
                                set it to 1 if the grayscale images and 3 if RGB input
            I_r_size (tuple): size of rectified images (used in STN transformations)
            inputDataType (str): the type of input data,
                                only support 'torch.cuda.FloatTensor' this version
            offsets (bool): set it to False if use SPN w.o. AIN,
                            and set it to True if use SPIN (both with SPN and AIN)
            norm_type (str): the normalization type of the module,
@ -123,6 +121,7 @@ class GA_SPIN_Transformer(nn.Layer):
            default_type (int): the K chromatic space,
                                set it to 3/5/6 depend on the complexity of transformation intensities
            loc_lr (float): learning rate of location network
            stn (bool): whether to use stn.
        """
        super(GA_SPIN_Transformer, self).__init__()
@ -233,12 +232,12 @@ class GA_SPIN_Transformer(nn.Layer):
    def forward(self, x, return_weight=False):
        """
        Args:
-            x (torch.cuda.FloatTensor): input image batch
+            x (Tensor): input image batch
            return_weight (bool): set to False by default,
                                  if set to True return the predicted offsets of AIN, denoted as x_{offsets}
        Returns:
-            torch.Tensor: rectified image [batch_size x I_channel_num x I_height x I_width], the same as the input size
+            Tensor: rectified image [batch_size x I_channel_num x I_height x I_width], the same as the input size
        """
        if self.spt:
--- a/tools/export_model.py
+++ b/tools/export_model.py
@ -73,12 +73,6 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None):
                shape=[None, 3, 64, 512], dtype="float32"),
        ]
        model = to_static(model, input_spec=other_shape)
    elif arch_config["algorithm"] == "SPIN":
        other_shape = [
            paddle.static.InputSpec(
                shape=[None, 1, 32, 100], dtype="float32"),
        ]
        model = to_static(model, input_spec=other_shape)
    else:
        infer_shape = [3, -1, -1]
        if arch_config["model_type"] == "rec":