PaddleOCR/ppocr/losses/det_ct_loss.py

# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/shengtao96/CentripetalText/tree/main/models/loss
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import nn
import paddle.nn.functional as F
import numpy as np


def ohem_single(score, gt_text, training_mask):
    """Build the online-hard-example-mining (OHEM) mask for one sample.

    Positives are the pixels with ``gt_text > 0.5`` that are not excluded by
    ``training_mask``. At most three negatives per positive are kept: a
    trainable pixel is selected when its score is at least the
    ``neg_num``-th largest score among trainable negatives.

    Args:
        score: score map of one sample.
        gt_text: ground-truth map; entries ``> 0.5`` count as text.
        training_mask: entries ``> 0.5`` mark pixels that may be trained on.

    Returns:
        A float32 tensor reshaped to ``(1, shape[0], shape[1])``. When there
        are no usable positives or no usable negatives, this is simply
        ``training_mask`` reshaped and cast.
    """

    def _to_float_map(mask):
        # Prepend a unit axis so per-sample masks can be concatenated later.
        rows, cols = mask.shape[0], mask.shape[1]
        return paddle.cast(mask.reshape((1, rows, cols)), "float32")

    text_px = gt_text > 0.5
    n_text = int(paddle.sum(text_px))
    n_text_ignored = int(paddle.sum(text_px & (training_mask <= 0.5)))
    pos_num = n_text - n_text_ignored
    if pos_num == 0:
        # No trainable positive: fall back to the plain training mask.
        return _to_float_map(training_mask)

    trainable_px = training_mask > 0.5
    negative_px = (gt_text <= 0.5) & trainable_px
    neg_num = int(min(pos_num * 3, int(paddle.sum(negative_px))))
    if neg_num == 0:
        return _to_float_map(training_mask)

    # Sorting the negated scores in ascending order ranks the negatives from
    # highest to lowest score; entry neg_num - 1 is the hard-example cut-off.
    negated_sorted = paddle.sort(-score[negative_px])
    threshold = -negated_sorted[neg_num - 1]

    keep = ((score >= threshold) | text_px) & trainable_px
    return _to_float_map(keep)


def ohem_batch(scores, gt_texts, training_masks):
    """Run ``ohem_single`` on every sample along axis 0.

    Args:
        scores: batch of score maps, indexed as ``[sample, :, :]``.
        gt_texts: batch of ground-truth maps, same indexing.
        training_masks: batch of training masks, same indexing.

    Returns:
        The per-sample masks concatenated along axis 0, as float32.
    """
    per_sample = [
        ohem_single(scores[idx, :, :], gt_texts[idx, :, :],
                    training_masks[idx, :, :])
        for idx in range(scores.shape[0])
    ]
    merged = paddle.concat(per_sample, 0)
    return paddle.cast(merged, "float32")


def iou_single(a, b, mask, n_class):
    """Mean per-class IoU of two label tensors over the masked region.

    Args:
        a: first label tensor.
        b: second label tensor, compared element-wise with ``a``.
        mask: only positions where ``mask == 1`` are taken into account.
        n_class: labels ``0 .. n_class - 1`` are evaluated.

    Returns:
        The average over classes of ``|a == c and b == c| / |a == c or b == c|``
        (a small epsilon is added to each denominator).
    """
    EPS = 1e-6
    keep = mask == 1
    a_kept = a[keep]
    b_kept = b[keep]

    per_class = []
    for cls in range(n_class):
        a_is_cls = a_kept == cls
        b_is_cls = b_kept == cls
        overlap = paddle.sum(paddle.cast(a_is_cls & b_is_cls, "float32"))
        combined = paddle.sum(paddle.cast(a_is_cls | b_is_cls, "float32"))
        per_class.append(overlap / (combined + EPS))

    return sum(per_class) / len(per_class)


def iou(a, b, mask, n_class=2, reduce=True):
    """Compute the mean-class IoU of each sample in a batch.

    Every input is flattened to ``(batch_size, -1)`` and each row is scored
    with ``iou_single``.

    Args:
        a: first batch of label tensors; axis 0 is the batch axis.
        b: second batch of label tensors.
        mask: batch of masks; positions equal to 1 are evaluated.
        n_class: number of classes passed on to ``iou_single``.
        reduce: when true, return the mean over the batch instead of the
            per-sample values.

    Returns:
        A float32 tensor with one value per sample, or its mean when
        ``reduce`` is true.
    """
    batch_size = a.shape[0]
    flat_shape = (batch_size, -1)

    a_flat = a.reshape(flat_shape)
    b_flat = b.reshape(flat_shape)
    mask_flat = mask.reshape(flat_shape)

    scores = paddle.zeros((batch_size, ), dtype="float32")
    for idx in range(batch_size):
        scores[idx] = iou_single(a_flat[idx], b_flat[idx], mask_flat[idx],
                                 n_class)

    return paddle.mean(scores) if reduce else scores


class DiceLoss(nn.Layer):
    """Masked Dice loss on sigmoid-activated inputs.

    With ``p = sigmoid(input) * mask`` and ``t = target * mask`` (both
    flattened per sample), the per-sample loss is
    ``loss_weight * (1 - 2 * sum(p * t) / (sum(p * p) + sum(t * t) + 0.002))``.
    """

    def __init__(self, loss_weight=1.0):
        super(DiceLoss, self).__init__()
        # Scalar factor applied to the per-sample loss.
        self.loss_weight = loss_weight

    def forward(self, input, target, mask, reduce=True):
        """Return the Dice loss.

        Args:
            input: raw (pre-sigmoid) predictions; axis 0 is the batch axis.
            target: ground-truth values, cast to float32 internally.
            mask: per-element weights, cast to float32 and multiplied into
                both ``input`` and ``target``.
            reduce: when true, average the per-sample losses.

        Returns:
            One loss value per sample, or their mean when ``reduce`` is true.
        """
        batch_size = input.shape[0]
        flat_shape = (batch_size, -1)

        weights = paddle.cast(mask.reshape(flat_shape), "float32")
        # Sigmoid squashes the raw predictions into the 0-1 range.
        probs = F.sigmoid(input).reshape(flat_shape) * weights
        truth = paddle.cast(target.reshape(flat_shape), "float32") * weights

        overlap = paddle.sum(probs * truth, axis=1)
        # The 0.001 terms keep the denominator away from zero.
        probs_energy = paddle.sum(probs * probs, axis=1) + 0.001
        truth_energy = paddle.sum(truth * truth, axis=1) + 0.001
        dice = (2 * overlap) / (probs_energy + truth_energy)

        loss = self.loss_weight * (1 - dice)
        return paddle.mean(loss) if reduce else loss


class SmoothL1Loss(nn.Layer):
    """Smooth-L1 loss on predicted offset maps, restricted to mis-located pixels.

    For every pixel of a ``grid_size x grid_size`` grid the predicted offset
    (scaled by 10) is added to the pixel coordinate. The loss is only applied
    to pixels whose instance label differs from the kernel-instance label at
    the shifted position (see ``select_single``).

    Args:
        beta: transition point between the quadratic and linear parts of the
            smooth-L1 function.
        loss_weight: scalar factor applied to the loss.
        grid_size: side length of the square coordinate grid. The maps given
            to ``forward`` are expected to be ``grid_size x grid_size``; the
            default of 640 matches the previously hard-coded value.
    """

    def __init__(self, beta=1.0, loss_weight=1.0, grid_size=640):
        super(SmoothL1Loss, self).__init__()
        self.beta = beta
        self.loss_weight = loss_weight

        # Row-major list of (x, y) pixel coordinates: entry i * grid_size + j
        # holds (j, i). Built with meshgrid rather than a Python double loop.
        axis = np.arange(grid_size, dtype=np.int64)
        xs, ys = np.meshgrid(axis, axis)  # xs[i, j] == j, ys[i, j] == i
        np_coord = np.stack([xs, ys], axis=-1).reshape((-1, 2))

        self.coord = self.create_parameter(
            shape=[grid_size * grid_size, 2],
            dtype="int32",  # NOTE: not support "int64" before paddle 2.3.1
            default_initializer=nn.initializer.Assign(value=np_coord))
        # The grid is a constant lookup table, not a trainable weight.
        self.coord.stop_gradient = True

    def forward_single(self, input, target, mask, beta=1.0, eps=1e-6):
        """Masked smooth-L1 loss, normalised per sample.

        Args:
            input: predicted maps; axis 0 is the batch axis.
            target: ground-truth maps with the same shape as ``input``.
            mask: per-pixel weights; ``unsqueeze(1)`` inserts a channel axis
                so the mask broadcasts over axis 1 of ``input``.
            beta: below this absolute difference the loss is quadratic,
                above it linear.
            eps: added to the mask sum to avoid division by zero.

        Returns:
            One value per sample: the summed loss divided by the mask sum.
        """
        batch_size = input.shape[0]

        diff = paddle.abs(input - target) * mask.unsqueeze(1)
        loss = paddle.where(diff < beta, 0.5 * diff * diff / beta,
                            diff - 0.5 * beta)
        loss = paddle.cast(loss.reshape((batch_size, -1)), "float32")
        mask = paddle.cast(mask.reshape((batch_size, -1)), "float32")
        loss = paddle.sum(loss, axis=-1)
        loss = loss / (mask.sum(axis=-1) + eps)

        return loss

    def select_single(self, distance, gt_instance, gt_kernel_instance,
                      training_mask):
        """Select the pixels of one sample whose predicted shift is wrong.

        Each grid point is moved by ``10 * distance`` (channel 0 is added to
        x, channel 1 to y), truncated to integers and clipped to
        ``[0, distance.shape[-1] - 1]``. A pixel is selected when its label
        in ``gt_instance`` differs from the label of the shifted point in
        ``gt_kernel_instance``.

        Args:
            distance: predicted offset maps of one sample; the first two
                entries along axis 0 are used.
            gt_instance: instance label map, indexed as ``[y, x]``.
            gt_kernel_instance: kernel-instance label map, indexed as
                ``[y, x]``.
            training_mask: multiplied into the selection mask.

        Returns:
            The selection mask reshaped to ``(1, -1, distance.shape[-1])``
            and multiplied by ``training_mask``. Computed without gradients.
        """

        with paddle.no_grad():
            # Hoisted once: the x / y columns of the coordinate table.
            grid_x = self.coord[:, 0]
            grid_y = self.coord[:, 1]

            # paddle 2.3.1, paddle.slice not support:
            # distance[:, self.coord[:, 1], self.coord[:, 0]]
            select_distance_list = []
            for i in range(2):
                tmp1 = distance[i, :]
                tmp2 = tmp1[grid_y, grid_x]
                select_distance_list.append(tmp2.unsqueeze(0))
            select_distance = paddle.concat(select_distance_list, axis=0)

            off_points = paddle.cast(
                self.coord, "float32") + 10 * select_distance.transpose((1, 0))

            off_points = paddle.cast(off_points, "int64")
            # Both coordinates are clipped with the last-axis size, so the
            # maps are assumed to be square.
            off_points = paddle.clip(off_points, 0, distance.shape[-1] - 1)

            selected_mask = (
                gt_instance[grid_y, grid_x] !=
                gt_kernel_instance[off_points[:, 1], off_points[:, 0]])
            selected_mask = paddle.cast(
                selected_mask.reshape((1, -1, distance.shape[-1])), "int64")
            selected_training_mask = selected_mask * training_mask

            return selected_training_mask

    def forward(self,
                distances,
                gt_instances,
                gt_kernel_instances,
                training_masks,
                gt_distances,
                reduce=True):
        """Compute the location loss and an accompanying IoU-like statistic.

        Args:
            distances: predicted offset maps, indexed as
                ``[sample, channel, y, x]``.
            gt_instances: instance label maps, indexed as ``[sample, y, x]``.
            gt_kernel_instances: kernel-instance label maps, same indexing.
            training_masks: training masks, same indexing.
            gt_distances: ground-truth offset maps matching ``distances``.
            reduce: when true, average the per-sample losses.

        Returns:
            A tuple ``(loss, iou_text)``. ``loss`` is the weighted smooth-L1
            loss over the selected pixels (per sample, or the mean when
            ``reduce`` is true). ``iou_text`` is, per sample, the fraction of
            ``training_masks`` that was not selected by ``select_single``.
        """

        selected_training_masks = []
        for i in range(distances.shape[0]):
            selected_training_masks.append(
                self.select_single(distances[i, :, :, :], gt_instances[i, :, :],
                                   gt_kernel_instances[i, :, :], training_masks[
                                       i, :, :]))
        selected_training_masks = paddle.cast(
            paddle.concat(selected_training_masks, 0), "float32")

        loss = self.forward_single(distances, gt_distances,
                                   selected_training_masks, self.beta)
        loss = self.loss_weight * loss

        with paddle.no_grad():
            batch_size = distances.shape[0]
            # "false" pixels are those still selected, i.e. whose shifted
            # position lands on a different label.
            false_num = selected_training_masks.reshape((batch_size, -1))
            false_num = false_num.sum(axis=-1)
            total_num = paddle.cast(
                training_masks.reshape((batch_size, -1)), "float32")
            total_num = total_num.sum(axis=-1)
            iou_text = (total_num - false_num) / (total_num + 1e-6)

        if reduce:
            loss = paddle.mean(loss)

        return loss, iou_text


class CTLoss(nn.Layer):
    """Total CentripetalText loss: kernel Dice loss plus location loss.

    The kernel loss is a ``DiceLoss`` restricted to an OHEM-selected mask;
    the location loss is a ``SmoothL1Loss`` with ``beta=0.1`` and
    ``loss_weight=0.05``.
    """

    def __init__(self):
        super(CTLoss, self).__init__()
        self.kernel_loss = DiceLoss()
        self.loc_loss = SmoothL1Loss(beta=0.1, loss_weight=0.05)

    def forward(self, preds, batch):
        """Compute the training loss.

        Args:
            preds: dict whose ``'maps'`` entry is the network output, indexed
                as ``[sample, channel, y, x]``. Channel 0 is the kernel map
                and the remaining channels are the offset maps.
            batch: sequence whose first element is skipped (not needed for
                the loss) and whose remaining six elements are, in order:
                gt_kernels, training_masks, gt_instances,
                gt_kernel_instances, training_mask_distances, gt_distances.

        Returns:
            ``{'loss': loss}`` where ``loss`` is the sum of the kernel loss
            and the location loss. Both are requested with ``reduce=False``,
            so the value holds one entry per sample.
            NOTE(review): confirm the training loop reduces this to a scalar
            as intended.
        """
        out = preds['maps']
        (gt_kernels, training_masks, gt_instances, gt_kernel_instances,
         training_mask_distances, gt_distances) = batch[1:]

        kernels = out[:, 0, :, :]
        distances = out[:, 1:, :, :]

        # kernel loss, evaluated only on the OHEM-selected pixels
        selected_masks = ohem_batch(kernels, gt_kernels, training_masks)
        loss_kernel = self.kernel_loss(
            kernels, gt_kernels, selected_masks, reduce=False)

        # loc loss; the IoU statistic it also returns is not part of the
        # result, so it is discarded here
        loss_loc, _ = self.loc_loss(
            distances,
            gt_instances,
            gt_kernel_instances,
            training_mask_distances,
            gt_distances,
            reduce=False)

        return {'loss': loss_kernel + loss_loc}