mirror of https://github.com/WongKinYiu/yolov7.git

detect.py: support for a txt file list
FL with configured weighted loss (alpha); test.py: t-SNE over the cropped bbox features (pull/2071/head)

parent 6b8d33fa31
commit 78cd0cb8f6
@@ -0,0 +1,37 @@
lr0: 0.001  # initial learning rate (SGD=1E-2, Adam=1E-3)
lrf: 0.01  # final OneCycleLR learning rate (lr0 * lrf)
momentum: 0.937  # SGD momentum / Adam beta1
weight_decay: 0.005  # optimizer weight decay (default 5e-4); raised to curb the overfitting seen in mAP
warmup_epochs: 3.0  # warmup epochs (fractions ok)
warmup_momentum: 0.8  # warmup initial momentum
warmup_bias_lr: 0.001  # warmup initial bias lr
loss_ota: 0  # use ComputeLossOTA (1); 0 for faster training
box: 0.05  # box loss gain
cls: 0.5  # cls loss gain
cls_pw: 1.0  # cls BCELoss positive_weight
obj: 1.0  # obj loss gain (scale with pixels)
obj_pw: 1.0  # obj BCELoss positive_weight
iou_t: 0.6  # IoU training threshold (code default was 0.2)
anchor_t: 4.0  # anchor-multiple threshold
anchors: 2  # anchors per output layer (0 to ignore); TODO (HK): was 3, consider restoring to 3
fl_gamma: 3.0  # focal loss gamma (EfficientDet default gamma=1.5)
hsv_h: 0.0  # image HSV-Hue augmentation (fraction)
hsv_s: 0.0  # image HSV-Saturation augmentation (fraction)
hsv_v: 0.0  # image HSV-Value augmentation (fraction)
degrees: 0  # image rotation (+/- deg)
translate: 0.2  # image translation (+/- fraction)
scale: 0.5  # image scale (+/- gain)
shear: 0.0  # image shear (+/- deg)
perspective: 0.0  # image perspective (+/- fraction), range 0-0.001
flipud: 0.3  # image flip up-down (probability)
fliplr: 0.5  # image flip left-right (probability)
mosaic: 0.5  # image mosaic (probability)
mixup: 0.15  # image mixup (probability)
copy_paste: 0.0  # image copy-paste (probability)
paste_in: 0.1  # image copy-paste / cutout (probability); use 0 for faster training
inversion: 0.5  # opposite-temperature (polarity) inversion (probability)
img_percentile_removal: 0.3
beta: 0.3
random_perspective: 1
scaling_before_mosaic: 1
gamma: 80  # percent; 90 gives more stability to gamma
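For context, fl_gamma > 0 is what switches the classification/objectness BCE losses to focal loss in the YOLOv5/YOLOv7 loss setup, which is what the commit title ("FL with configured weighted loss: alpha") refers to. A minimal sketch of that wiring, following the FocalLoss class shipped in upstream utils/loss.py; the fl_alpha key is hypothetical here (upstream hard-codes alpha=0.25):

    import yaml
    import torch
    import torch.nn as nn

    class FocalLoss(nn.Module):
        # Wraps a BCEWithLogitsLoss criterion with focal-loss modulation.
        def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
            super().__init__()
            self.loss_fcn = loss_fcn  # must be nn.BCEWithLogitsLoss()
            self.gamma = gamma
            self.alpha = alpha
            self.reduction = loss_fcn.reduction
            self.loss_fcn.reduction = 'none'  # weight each element before reducing

        def forward(self, pred, true):
            loss = self.loss_fcn(pred, true)
            pred_prob = torch.sigmoid(pred)
            p_t = true * pred_prob + (1 - true) * (1 - pred_prob)
            alpha_factor = true * self.alpha + (1 - true) * (1 - self.alpha)
            loss *= alpha_factor * (1.0 - p_t) ** self.gamma
            if self.reduction == 'mean':
                return loss.mean()
            if self.reduction == 'sum':
                return loss.sum()
            return loss

    # hyp is the dict loaded from this YAML file:
    with open('hyp.yaml') as f:
        hyp = yaml.safe_load(f)
    if hyp['fl_gamma'] > 0:
        # fl_alpha is an assumed key; upstream uses the default alpha=0.25
        BCEcls = FocalLoss(nn.BCEWithLogitsLoss(), hyp['fl_gamma'], hyp.get('fl_alpha', 0.25))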
@@ -0,0 +1,124 @@
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import torchvision.ops as ops
import os


class ObjectEmbeddingVisualizer:
    def __init__(self, model, device):
        # Alternative: load a pretrained model via torch.hub, e.g.
        #   torch.hub.load('WongKinYiu/yolov7', 'custom', ...) or
        #   torch.hub.load('ultralytics/yolov5', 'yolov5s')
        # Here the caller supplies an already-built model and device.
        self.model = model
        self.device = device
    def extract_object_features(self, image, predictions):
        with torch.no_grad():
            # Get feature maps from the backbone
            if hasattr(self.model, 'model'):
                feature_maps = self.model.model.backbone(image.to(self.device))
            else:
                feature_maps = self.model.backbone(image.to(self.device))

            # Get boxes and labels
            boxes = predictions[0].boxes.xyxy  # x1, y1, x2, y2
            labels = predictions[0].boxes.cls

            object_features = []
            for scale_idx, feat_map in enumerate(feature_maps):
                # Ratio between the input image and this feature map
                scale_h = image.shape[2] / feat_map.shape[2]
                scale_w = image.shape[3] / feat_map.shape[3]

                # Scale boxes to feature-map coordinates
                scaled_boxes = boxes.clone()
                scaled_boxes[:, [0, 2]] = scaled_boxes[:, [0, 2]] / scale_w
                scaled_boxes[:, [1, 3]] = scaled_boxes[:, [1, 3]] / scale_h

                # ROI pooling over each detected box
                roi_features = ops.roi_pool(feat_map, [scaled_boxes.to(self.device)],
                                            output_size=(7, 7))

                # Global average pooling to one vector per object
                pooled_features = F.adaptive_avg_pool2d(roi_features, (1, 1))
                object_features.append(pooled_features.squeeze(-1).squeeze(-1))

            # Concatenate features from all scales
            all_features = torch.cat(object_features, dim=1)
            return all_features.cpu().numpy(), labels.cpu().numpy()
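Note that the predictions[0].boxes.xyxy / .cls accessors above assume an ultralytics-style Results object. If predictions instead comes from YOLOv7's non_max_suppression (one [n, 6] tensor of x1, y1, x2, y2, conf, cls per image), a hedged adaptation would be:

    # Hypothetical adaptation for raw NMS output (list of [n, 6] tensors):
    det = predictions[0]
    boxes = det[:, :4]   # x1, y1, x2, y2
    labels = det[:, 5]   # class index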
    def extract_object_grounded_features(self, feature_maps, predictions, image_shape: tuple):
        scale = 2  # index of the scale to keep (the last of the 3 FPN scales)
        assert len(image_shape) == 4, 'image shape should be tensor [batch, ch, h, w]'
        object_cls = []
        object_features = []
        try:
            for i_batch, pred in enumerate(predictions):
                # Run over the 3 feature maps (one per scale) for this batch element
                for scale_idx, feat_map_all_batches in enumerate(feature_maps):
                    if scale_idx != scale:
                        continue  # take only the last scale
                    feat_map = feat_map_all_batches[i_batch, :, :, :]
                    boxes = pred[:, :4]  # x1, y1, x2, y2
                    labels = pred[:, 5]

                    # Ratio between the input image and this feature map
                    scale_h = image_shape[2] / feat_map.shape[1]
                    scale_w = image_shape[3] / feat_map.shape[2]

                    # Scale boxes to feature-map coordinates
                    scaled_boxes = boxes.clone()
                    scaled_boxes[:, [0, 2]] = scaled_boxes[:, [0, 2]] / scale_w
                    scaled_boxes[:, [1, 3]] = scaled_boxes[:, [1, 3]] / scale_h

                    # ROI pooling over each detected box
                    roi_features = ops.roi_pool(feat_map.float()[None, ...],
                                                [scaled_boxes.to(self.device)],
                                                output_size=(7, 7))

                    # Global average pooling to one vector per object
                    pooled_features = F.adaptive_avg_pool2d(roi_features, (1, 1))
                    for x in pooled_features:
                        object_features.append(x.squeeze(-1).squeeze(-1)[None, ...])
                    for x in labels:
                        object_cls.append(x.cpu().numpy())

            # Concatenate per-object features over the whole batch
            all_features = torch.cat(object_features, dim=0)
            object_cls = np.array(object_cls)
        except Exception as e:
            raise Exception(f'Error extracting features for batch {i_batch}: {e}')

        return all_features, object_cls
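As a design note, torchvision.ops.roi_pool also accepts a spatial_scale argument, so the manual box rescaling above could be folded into the call when the feature map is downscaled by a single stride (i.e. scale_h == scale_w):

    # Equivalent when the feature map stride is uniform (`stride` is the h==w ratio):
    roi_features = ops.roi_pool(feat_map.float()[None, ...],
                                [boxes.to(self.device)],
                                output_size=(7, 7),
                                spatial_scale=1.0 / stride)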
    def visualize_object_embeddings(self, features, labels, path='.', tag=''):
        if isinstance(features, torch.Tensor):
            features = features.cpu().numpy()
        # Perplexity must be smaller than the number of samples
        tsne = TSNE(n_components=2, perplexity=min(30, len(features) - 1))
        embeddings_2d = tsne.fit_transform(features)

        plt.figure(figsize=(10, 10))
        scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=labels)
        plt.colorbar(scatter, label='Object Class')
        plt.title('Object embeddings: support {}, classes {}'.format(features.shape[0],
                                                                     np.unique(labels).size))
        # Save before show(): show() may clear the current figure
        plt.savefig(os.path.join(path, 'tsne' + str(tag) + '.png'))
        plt.show()

        return embeddings_2d
    def process_image(self, image_tensor):
        predictions = self.model(image_tensor)
        features, labels = self.extract_object_features(image_tensor, predictions)
        embeddings = self.visualize_object_embeddings(features, labels)
        return embeddings, labels

"""
# Usage example
visualizer = ObjectEmbeddingVisualizer(model, device)

# Assuming image_tensor is your input image [1, C, H, W]
embeddings, labels = visualizer.process_image(image_tensor)
"""