update imbalanced sampler

Summary: add a new sampler that is useful for imbalanced or long-tailed datasets. It is based on ufoym/imbalanced-dataset-sampler.
2021-04-21 17:05:10 +08:00 · 2021-04-21 17:05:10 +08:00 · 0c8e3d9805
parent bb6ddbf8b1
commit 0c8e3d9805
8 changed files with 79 additions and 9 deletions
--- a/fastreid/data/build.py
+++ b/fastreid/data/build.py
@ -56,6 +56,8 @@ def _train_loader_from_config(cfg, *, train_set=None, transforms=None, sampler=N
        elif sampler_name == "SetReWeightSampler":
            set_weight = cfg.DATALOADER.SET_WEIGHT
            sampler = samplers.SetReWeightSampler(train_set.img_items, mini_batch_size, num_instance, set_weight)
+        elif sampler_name == "ImbalancedDatasetSampler":
+            sampler = samplers.ImbalancedDatasetSampler(train_set.img_items)
        else:
            raise ValueError("Unknown training sampler: {}".format(sampler_name))

--- a/fastreid/data/samplers/init.py
+++ b/fastreid/data/samplers/init.py
@ -6,11 +6,13 @@

 from .triplet_sampler import BalancedIdentitySampler, NaiveIdentitySampler, SetReWeightSampler
 from .data_sampler import TrainingSampler, InferenceSampler
+from .imbalance_sampler import ImbalancedDatasetSampler

 __all__ = [
    "BalancedIdentitySampler",
    "NaiveIdentitySampler",
    "SetReWeightSampler",
    "TrainingSampler",
-    "InferenceSampler"
+    "InferenceSampler",
+    "ImbalancedDatasetSampler",
 ]
--- a/fastreid/data/samplers/imbalance_sampler.py
+++ b/fastreid/data/samplers/imbalance_sampler.py
@ -0,0 +1,67 @@
+# encoding: utf-8
+"""
+@author:  xingyu liao
+@contact: sherlockliao01@gmail.com
+"""
+
+# based on:
+# https://github.com/ufoym/imbalanced-dataset-sampler/blob/master/torchsampler/imbalanced.py
+
+
+import itertools
+from typing import Optional, List, Callable
+
+import numpy as np
+import torch
+from torch.utils.data.sampler import Sampler
+
+from fastreid.utils import comm
+
+
+class ImbalancedDatasetSampler(Sampler):
+    """Infinite sampler that re-balances an imbalanced dataset.
+
+    Every index is drawn with a probability proportional to
+    ``1 / (number of items sharing its label)``, so items of rare labels are
+    drawn as often, in aggregate, as items of frequent labels.
+
+    Arguments:
+        data_source: a list of data items. Unless ``callback_get_label`` is
+            given, the label of item ``i`` is read from ``data_source[i][1]``.
+        size: number of indices drawn per multinomial call; defaults to
+            ``len(data_source)``.
+        seed: seed of the sampling stream. If None, the value returned by
+            ``comm.shared_random_seed()`` is used (presumably identical on
+            all workers -- confirm against ``fastreid.utils.comm``).
+        callback_get_label: optional callable ``(data_source, idx) -> label``
+            that overrides the default label lookup.
+    """
+
+    def __init__(self, data_source: List, size: Optional[int] = None, seed: Optional[int] = None,
+                 callback_get_label: Optional[Callable] = None):
+        self.data_source = data_source
+        # consider all elements in the dataset
+        self.indices = list(range(len(data_source)))
+        # if size is not provided, draw `len(indices)` samples per multinomial call
+        self._size = len(self.indices) if size is None else size
+        self.callback_get_label = callback_get_label
+
+        # look up every label exactly once; the callback may be expensive
+        labels = [self._get_label(data_source, idx) for idx in self.indices]
+
+        # distribution of classes in the dataset
+        label_to_count = {}
+        for label in labels:
+            label_to_count[label] = label_to_count.get(label, 0) + 1
+
+        # weight for each sample: inverse frequency of its label
+        self.weights = torch.DoubleTensor([1.0 / label_to_count[label] for label in labels])
+
+        if seed is None:
+            seed = comm.shared_random_seed()
+        self._seed = int(seed)
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+
+    def _get_label(self, dataset, idx):
+        """Return the label of ``dataset[idx]``, via the callback when one is set."""
+        if self.callback_get_label:
+            return self.callback_get_label(dataset, idx)
+        else:
+            return dataset[idx][1]
+
+    def __iter__(self):
+        # Each rank takes every `world_size`-th element of the stream,
+        # starting at its own rank.
+        start = self._rank
+        yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
+
+    def _infinite_indices(self):
+        """Yield an endless, seed-determined stream of weighted random indices."""
+        # torch.multinomial draws from torch's RNG, so seeding NumPy has no
+        # effect on it. A dedicated, explicitly seeded generator makes the
+        # stream depend only on `self._seed`, which the rank-strided slicing
+        # in `__iter__` needs in order to split one common stream.
+        generator = torch.Generator()
+        generator.manual_seed(self._seed)
+        while True:
+            drawn = torch.multinomial(self.weights, self._size, replacement=True, generator=generator)
+            for i in drawn.tolist():
+                yield self.indices[i]
--- a/fastreid/data/samplers/triplet_sampler.py
+++ b/fastreid/data/samplers/triplet_sampler.py
@ -7,7 +7,7 @@
 import copy
 import itertools
 from collections import defaultdict
-from typing import Optional
+from typing import Optional, List

 import numpy as np
 from torch.utils.data.sampler import Sampler
@ -39,7 +39,7 @@ def reorder_index(batch_indices, world_size):


 class BalancedIdentitySampler(Sampler):
-    def __init__(self, data_source: str, mini_batch_size: int, num_instances: int, seed: Optional[int] = None):
+    def __init__(self, data_source: List, mini_batch_size: int, num_instances: int, seed: Optional[int] = None):
        self.data_source = data_source
        self.num_instances = num_instances
        self.num_pids_per_batch = mini_batch_size // self.num_instances
--- a/fastreid/engine/defaults.py
+++ b/fastreid/engine/defaults.py
@ -149,7 +149,7 @@ class DefaultPredictor:
        Returns:
            predictions (torch.tensor): the output features of the model
        """
-        inputs = {"images": image}
+        inputs = {"images": image.to(self.model.device)}
        with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
            predictions = self.model(inputs)
            # Normalize feature to compute cosine distance
--- a/fastreid/evaluation/clas_evaluator.py
+++ b/fastreid/evaluation/clas_evaluator.py
@ -38,7 +38,6 @@ class ClasEvaluator(DatasetEvaluator):
    def __init__(self, cfg, output_dir=None):
        self.cfg = cfg
        self._output_dir = output_dir
-
        self._cpu_device = torch.device('cpu')

        self._predictions = []
@ -49,7 +48,7 @@ class ClasEvaluator(DatasetEvaluator):
    def process(self, inputs, outputs):
        predictions = {
            "logits": outputs.to(self._cpu_device, torch.float32),
-            "labels": inputs["targets"],
+            "labels": inputs["targets"].to(self._cpu_device),
        }
        self._predictions.append(predictions)

--- a/fastreid/modeling/heads/clas_head.py
+++ b/fastreid/modeling/heads/clas_head.py
@ -25,12 +25,12 @@ class ClasHead(EmbeddingHead):
            logits = F.linear(F.normalize(neck_feat), F.normalize(self.weight))

        # Evaluation
-        if not self.training: return logits * self.cls_layer.s
+        if not self.training: return logits.mul_(self.cls_layer.s)

        cls_outputs = self.cls_layer(logits, targets)

        return {
            "cls_outputs": cls_outputs,
-            "pred_class_logits": logits * self.cls_layer.s,
+            "pred_class_logits": logits.mul_(self.cls_layer.s),
            "features": neck_feat,
        }
--- a/fastreid/modeling/heads/embedding_head.py
+++ b/fastreid/modeling/heads/embedding_head.py
@ -142,6 +142,6 @@ class EmbeddingHead(nn.Module):

        return {
            "cls_outputs": cls_outputs,
-            "pred_class_logits": logits * self.cls_layer.s,
+            "pred_class_logits": logits.mul(self.cls_layer.s),
            "features": feat,
        }