# This file is modified from
# https://github.com/facebookresearch/deepcluster/blob/master/clustering.py
import time
import numpy as np
try:
    import faiss
except ImportError:
    # faiss is an optional dependency; the clustering utilities below need it
    faiss = None
import torch
from scipy.sparse import csr_matrix

__all__ = ['Kmeans', 'PIC']

def preprocess_features(npdata, pca):
"""Preprocess an array of features.
Args:
npdata (np.array N * ndim): features to preprocess
pca (int): dim of output
Returns:
np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
"""
_, ndim = npdata.shape
    # faiss requires float32 input
    assert npdata.dtype == np.float32
    if np.any(np.isnan(npdata)):
        raise Exception("NaN occurs in the input features")
if pca != -1:
print("\nPCA from dim {} to dim {}".format(ndim, pca))
mat = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
mat.train(npdata)
assert mat.is_trained
npdata = mat.apply_py(npdata)
if np.any(np.isnan(npdata)):
percent = np.isnan(npdata).sum().item() / float(np.size(npdata)) * 100
if percent > 0.1:
raise Exception(
"More than 0.1% nan occurs after pca, percent: {}%".format(
percent))
else:
npdata[np.isnan(npdata)] = 0.
# L2 normalization
row_sums = np.linalg.norm(npdata, axis=1)
npdata = npdata / (row_sums[:, np.newaxis] + 1e-10)
return npdata
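
# A minimal usage sketch for preprocess_features (hypothetical shapes; assumes
# faiss is installed and the features are already float32):
#
#   feats = np.random.rand(1000, 512).astype(np.float32)
#   feats = preprocess_features(feats, pca=256)  # (1000, 256), L2-normalized
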
def make_graph(xb, nnn):
"""Builds a graph of nearest neighbors.
Args:
xb (np.array): data
nnn (int): number of nearest neighbors
Returns:
list: for each data the list of ids to its nnn nearest neighbors
list: for each data the list of distances to its nnn NN
"""
N, dim = xb.shape
    # we need only one StandardGpuResources per GPU
    res = faiss.StandardGpuResources()

    # L2 index placed on the last visible GPU
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.device = int(torch.cuda.device_count()) - 1
index = faiss.GpuIndexFlatL2(res, dim, flat_config)
index.add(xb)
    # search nnn + 1 neighbors: each point is returned as its own first neighbor
    D, I = index.search(xb, nnn + 1)
return I, D
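
# A minimal usage sketch for make_graph (assumes a CUDA-capable GPU; shapes
# are illustrative):
#
#   I, D = make_graph(feats, nnn=5)
#   # I: (N, 6) neighbor ids, column 0 is each point itself
#   # D: (N, 6) squared L2 distances, column 0 is all zeros
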
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.

    Args:
        x: data
        nmb_clusters (int): number of clusters

    Returns:
        list: cluster assignment of each data point
        float: final k-means loss
    """
n_data, d = x.shape
# faiss implementation of k-means
clus = faiss.Clustering(d, nmb_clusters)
# Change faiss seed at each k-means so that the randomly picked
# initialization centroids do not correspond to the same feature ids
# from an epoch to another.
clus.seed = np.random.randint(1234)
clus.niter = 20
clus.max_points_per_centroid = 10000000
res = faiss.StandardGpuResources()
flat_config = faiss.GpuIndexFlatConfig()
flat_config.useFloat16 = False
flat_config.device = 0
index = faiss.GpuIndexFlatL2(res, d, flat_config)
# perform the training
clus.train(x, index)
_, I = index.search(x, 1)
    # clus.obj holds the objective value after each iteration
    # (newer faiss versions expose this via clus.iteration_stats instead)
    losses = faiss.vector_to_array(clus.obj)
if verbose:
print('k-means loss evolution: {0}'.format(losses))
return [int(n[0]) for n in I], losses[-1]
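
# A minimal usage sketch for run_kmeans (feats as produced by
# preprocess_features above):
#
#   assignments, loss = run_kmeans(feats, nmb_clusters=100, verbose=True)
#   # assignments[i] is the cluster id of feats[i]; loss is the final objective
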
def arrange_clustering(images_lists):
    """Flatten per-cluster image lists into an array of pseudo-labels
    ordered by image index."""
pseudolabels = []
image_indexes = []
for cluster, images in enumerate(images_lists):
image_indexes.extend(images)
pseudolabels.extend([cluster] * len(images))
indexes = np.argsort(image_indexes)
return np.asarray(pseudolabels)[indexes]
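
# For example (a hypothetical clustering of 4 images into 2 clusters):
#
#   arrange_clustering([[0, 3], [1, 2]])  # -> array([0, 1, 1, 0])
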
class Kmeans:
def __init__(self, k, pca_dim=256):
self.k = k
self.pca_dim = pca_dim
    def cluster(self, feat, verbose=False):
        """Performs k-means clustering.

        Args:
            feat (np.array N * dim): data to cluster
        """
end = time.time()
# PCA-reducing, whitening and L2-normalization
xb = preprocess_features(feat, self.pca_dim)
# cluster the data
I, loss = run_kmeans(xb, self.k, verbose)
self.labels = np.array(I)
if verbose:
print('k-means time: {0:.0f} s'.format(time.time() - end))
return loss
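
# A minimal usage sketch for Kmeans (hypothetical feature matrix):
#
#   kmeans = Kmeans(k=100, pca_dim=256)
#   loss = kmeans.cluster(feats, verbose=True)
#   labels = kmeans.labels  # (N,) array with the cluster id of each sample
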
def make_adjacencyW(I, D, sigma):
"""Create adjacency matrix with a Gaussian kernel.
Args:
I (numpy array): for each vertex the ids to its nnn linked vertices
+ first column of identity.
D (numpy array): for each data the l2 distances to its nnn linked vertices
+ first column of zeros.
        sigma (float): Bandwidth of the Gaussian kernel.
Returns:
csr_matrix: affinity matrix of the graph.
"""
V, k = I.shape
k = k - 1
indices = np.reshape(np.delete(I, 0, 1), (1, -1))
indptr = np.multiply(k, np.arange(V + 1))
    # Gaussian kernel, applied elementwise (np.exp broadcasts over arrays)
    res_D = np.exp(-D / sigma**2)
data = np.reshape(np.delete(res_D, 0, 1), (1, -1))
adj_matrix = csr_matrix((data[0], indices[0], indptr), shape=(V, V))
return adj_matrix
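
# For example, using the I, D returned by make_graph above:
#
#   A = make_adjacencyW(I, D, sigma=0.2)  # sparse (N, N) affinity matrix
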
def run_pic(I, D, sigma, alpha):
    """Run PIC algorithm"""
    a = make_adjacencyW(I, D, sigma)
    # symmetrize the affinity matrix
    W = a + a.transpose()
    nim = W.shape[0]

    # power iterations on the damped transition matrix
    v = (np.ones(nim) / nim).astype('float32')
    for _ in range(200):
        vnext = W.transpose().dot(v)
        vnext = alpha * vnext + (1 - alpha) / nim
        # L1 normalize
        vnext /= vnext.sum()
        v = vnext

    clust = find_maxima_cluster(W, v)
    return [int(c) for c in clust]
def find_maxima_cluster(W, v):
    """Assign each node to the local maximum of v reached by repeatedly
    moving to the neighbor with the largest weighted increase in v."""
    n, m = W.shape
    assert n == m
    assign = np.zeros(n)
    # each node initially points to itself
    pointers = list(range(n))
for i in range(n):
best_vi = 0
l0 = W.indptr[i]
l1 = W.indptr[i + 1]
for l in range(l0, l1):
j = W.indices[l]
vi = W.data[l] * (v[j] - v[i])
if vi > best_vi:
best_vi = vi
pointers[i] = j
n_clus = 0
cluster_ids = -1 * np.ones(n)
for i in range(n):
if pointers[i] == i:
cluster_ids[i] = n_clus
n_clus = n_clus + 1
for i in range(n):
        # follow pointers from i until a local maximum is reached
current_node = i
while pointers[current_node] != current_node:
current_node = pointers[current_node]
assign[i] = cluster_ids[current_node]
assert (assign[i] >= 0)
return assign
class PIC:
    """Class to perform Power Iteration Clustering on a graph of nearest
    neighbors.

    Args:
        args: for consistency with k-means init
        sigma (float): bandwidth of the Gaussian kernel (default 0.2)
        nnn (int): number of nearest neighbors (default 5)
        alpha (float): parameter in PIC (default 0.001)
        distribute_singletons (bool): If True, reassign each singleton to
            the cluster of its closest non-singleton nearest neighbor
            (searching up to nnn nearest neighbors).
        pca_dim (int): dim of the PCA-reduced features (default 256)

    Attributes:
        images_lists (list of list): for each cluster, the list of image
            indexes belonging to this cluster
    """
def __init__(self,
args=None,
sigma=0.2,
nnn=5,
alpha=0.001,
distribute_singletons=True,
pca_dim=256):
self.sigma = sigma
self.alpha = alpha
self.nnn = nnn
self.distribute_singletons = distribute_singletons
self.pca_dim = pca_dim
    def cluster(self, data, verbose=False):
        end = time.time()

        # preprocess the data
        xb = preprocess_features(data, self.pca_dim)

        # construct nnn graph
        I, D = make_graph(xb, self.nnn)

        # run PIC
        clust = run_pic(I, D, self.sigma, self.alpha)
        images_lists = {}
        for h in set(clust):
            images_lists[h] = []
        # use `idx` here: shadowing the `data` argument would break the
        # self.labels allocation below, which needs data.shape[0]
        for idx, c in enumerate(clust):
            images_lists[c].append(idx)
# allocate singletons to clusters of their closest NN not singleton
if self.distribute_singletons:
clust_NN = {}
for i in images_lists:
# if singleton
if len(images_lists[i]) == 1:
s = images_lists[i][0]
# for NN
for n in I[s, 1:]:
# if NN is not a singleton
if not len(images_lists[clust[n]]) == 1:
clust_NN[s] = n
break
for s in clust_NN:
del images_lists[clust[s]]
clust[s] = clust[clust_NN[s]]
images_lists[clust[s]].append(s)
self.images_lists = []
        self.labels = -1 * np.ones((data.shape[0], ), dtype=int)
for i, c in enumerate(images_lists):
self.images_lists.append(images_lists[c])
self.labels[images_lists[c]] = i
assert np.all(self.labels != -1)
if verbose:
print('pic time: {0:.0f} s'.format(time.time() - end))
return 0
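
# A minimal usage sketch for PIC (hypothetical feature matrix; make_graph
# needs a CUDA-capable GPU for the faiss nearest-neighbor search):
#
#   pic = PIC(sigma=0.2, nnn=5, alpha=0.001)
#   pic.cluster(feats, verbose=True)
#   labels = pic.labels            # (N,) cluster id per sample
#   clusters = pic.images_lists    # list of image-index lists, one per cluster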