# This file is modified from
# https://github.com/facebookresearch/deepcluster/blob/master/clustering.py

import time

import numpy as np
from scipy.sparse import csr_matrix

try:
    import faiss
except ImportError:
    # faiss is optional at import time; the clustering utilities below
    # will fail at call time if it is unavailable.
    pass
import torch

__all__ = ['Kmeans', 'PIC']


def preprocess_features(npdata, pca):
    """Preprocess an array of features.

    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): output dim of the PCA (-1 to skip the PCA)
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    _, ndim = npdata.shape
    # the caller is expected to provide float32 features
    assert npdata.dtype == np.float32

    if np.any(np.isnan(npdata)):
        raise Exception("nan occurs")
    if pca != -1:
        print("\nPCA from dim {} to dim {}".format(ndim, pca))
        # eigen_power=-0.5 whitens the PCA-projected features
        mat = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
        mat.train(npdata)
        assert mat.is_trained
        npdata = mat.apply_py(npdata)
    if np.any(np.isnan(npdata)):
        percent = np.isnan(npdata).sum().item() / float(np.size(npdata)) * 100
        if percent > 0.1:
            raise Exception(
                "More than 0.1% nan occurs after pca, percent: {}%".format(
                    percent))
        else:
            npdata[np.isnan(npdata)] = 0.

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / (row_sums[:, np.newaxis] + 1e-10)

    return npdata
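

# Usage sketch (illustrative, not part of the original file; assumes faiss
# is installed and the features are float32):
#
#   feats = np.random.rand(1000, 512).astype('float32')
#   feats = preprocess_features(feats, pca=256)
#   # feats.shape == (1000, 256); rows have (approximately) unit L2 norm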


def make_graph(xb, nnn):
    """Builds a graph of nearest neighbors.

    Args:
        xb (np.array): data
        nnn (int): number of nearest neighbors
    Returns:
        list: for each datum, the ids of its nnn nearest neighbors
        list: for each datum, the distances to its nnn nearest neighbors
    """
    N, dim = xb.shape

    # we need only one StandardGpuResources per GPU
    res = faiss.StandardGpuResources()

    # L2 index on the last available GPU
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.device = int(torch.cuda.device_count()) - 1
    index = faiss.GpuIndexFlatL2(res, dim, flat_config)
    index.add(xb)
    # nnn + 1 because the first neighbor of each point is the point itself
    D, I = index.search(xb, nnn + 1)
    return I, D
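

# Usage sketch (illustrative; assumes faiss-gpu and at least one CUDA GPU):
#
#   I, D = make_graph(xb, nnn=5)
#   # I.shape == D.shape == (N, 6): column 0 is each point itself
#   # (distance 0), columns 1..5 are its 5 nearest neighbors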


def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.

    Args:
        x: data
        nmb_clusters (int): number of clusters
        verbose (bool): print the k-means loss evolution
    Returns:
        list: cluster id of each data point
        float: final k-means loss
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)

    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    # assign each point to its nearest centroid
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
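

# Usage sketch (illustrative; assumes faiss-gpu):
#
#   labels, loss = run_kmeans(xb, nmb_clusters=100, verbose=True)
#   # labels[i] is the cluster id of xb[i]; loss is the final k-means
#   # objective reported by faiss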


def arrange_clustering(images_lists):
    pseudolabels = []
    image_indexes = []
    for cluster, images in enumerate(images_lists):
        image_indexes.extend(images)
        pseudolabels.extend([cluster] * len(images))
    indexes = np.argsort(image_indexes)
    return np.asarray(pseudolabels)[indexes]
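

# Worked example (illustrative): with images_lists = [[2, 0], [1]],
# images 0 and 2 belong to cluster 0 and image 1 to cluster 1, so
# arrange_clustering returns array([0, 1, 0]): pseudolabels sorted
# by image index.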


class Kmeans:

    def __init__(self, k, pca_dim=256):
        self.k = k
        self.pca_dim = pca_dim

    def cluster(self, feat, verbose=False):
        """Performs k-means clustering.

        Args:
            feat (np.array N * dim): data to cluster
        """
        end = time.time()

        # PCA-reducing, whitening and L2-normalization
        xb = preprocess_features(feat, self.pca_dim)

        # cluster the data
        I, loss = run_kmeans(xb, self.k, verbose)
        self.labels = np.array(I)
        if verbose:
            print('k-means time: {0:.0f} s'.format(time.time() - end))

        return loss
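

# Usage sketch (illustrative; `feat` stands for any N x dim float32
# feature matrix, e.g. CNN features of a dataset):
#
#   km = Kmeans(k=100, pca_dim=256)
#   loss = km.cluster(feat, verbose=True)
#   pseudolabels = km.labels  # cluster id per sample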


def make_adjacencyW(I, D, sigma):
    """Create adjacency matrix with a Gaussian kernel.

    Args:
        I (numpy array): for each vertex the ids of its nnn linked vertices
            + first column of identity.
        D (numpy array): for each data the l2 distances to its nnn linked
            vertices + first column of zeros.
        sigma (float): Bandwidth of the Gaussian kernel.

    Returns:
        csr_matrix: affinity matrix of the graph.
    """
    V, k = I.shape
    k = k - 1
    # drop the first column (self-edges) and flatten into CSR components
    indices = np.reshape(np.delete(I, 0, 1), (1, -1))
    indptr = np.multiply(k, np.arange(V + 1))

    # Gaussian kernel applied to the l2 distances
    res_D = np.exp(-D / sigma**2)
    data = np.reshape(np.delete(res_D, 0, 1), (1, -1))
    adj_matrix = csr_matrix((data[0], indices[0], indptr), shape=(V, V))
    return adj_matrix
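

# Note (illustrative): the returned CSR matrix stores exactly k = nnn
# entries per row i, the weights exp(-d_ij / sigma^2) for the nnn
# neighbors j of i; the self-edge in the first column of I/D is dropped.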


def run_pic(I, D, sigma, alpha):
    """Run the PIC (Power Iteration Clustering) algorithm."""
    a = make_adjacencyW(I, D, sigma)
    # symmetrize the affinity matrix
    W = a + a.transpose()
    nim = W.shape[0]

    # power iterations, starting from the uniform distribution:
    # v <- alpha * W^T v + (1 - alpha) / nim, then L1 normalization
    v = (np.ones(nim) / nim).astype('float32')
    for i in range(200):
        vnext = W.transpose().dot(v)
        vnext = alpha * vnext + (1 - alpha) / nim
        # L1 normalize
        vnext /= vnext.sum()
        v = vnext

    # assign each point to the local maximum of v it converges to
    clust = find_maxima_cluster(W, v)
    return [int(i) for i in clust]
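

# Usage sketch (illustrative): given I, D from make_graph,
#
#   clust = run_pic(I, D, sigma=0.2, alpha=0.001)
#   # clust[i] is the cluster id of point i, one cluster per local
#   # maximum of the stationary vector v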


def find_maxima_cluster(W, v):
    n, m = W.shape
    assert (n == m)
    assign = np.zeros(n)
    # for each node, point to the neighbor with the steepest ascent of v
    pointers = list(range(n))
    for i in range(n):
        best_vi = 0
        l0 = W.indptr[i]
        l1 = W.indptr[i + 1]
        for l in range(l0, l1):
            j = W.indices[l]
            vi = W.data[l] * (v[j] - v[i])
            if vi > best_vi:
                best_vi = vi
                pointers[i] = j
    # nodes that point to themselves are local maxima: one cluster each
    n_clus = 0
    cluster_ids = -1 * np.ones(n)
    for i in range(n):
        if pointers[i] == i:
            cluster_ids[i] = n_clus
            n_clus = n_clus + 1
    for i in range(n):
        # follow the pointers starting from i until a local optimum is reached
        current_node = i
        while pointers[current_node] != current_node:
            current_node = pointers[current_node]

        assign[i] = cluster_ids[current_node]
        assert (assign[i] >= 0)
    return assign


class PIC():
    """Class to perform Power Iteration Clustering on a graph of nearest
    neighbors.

    Args:
        args: for consistency with k-means init
        sigma (float): bandwidth of the Gaussian kernel (default 0.2)
        nnn (int): number of nearest neighbors (default 5)
        alpha (float): parameter in PIC (default 0.001)
        distribute_singletons (bool): If True, reassign each singleton to
            the cluster of its closest non-singleton nearest neighbor
            (among its nnn nearest neighbors).
        pca_dim (int): dimension kept after PCA preprocessing (default 256)

    Attributes:
        images_lists (list of list): for each cluster, the list of image
            indexes belonging to this cluster
    """

    def __init__(self,
                 args=None,
                 sigma=0.2,
                 nnn=5,
                 alpha=0.001,
                 distribute_singletons=True,
                 pca_dim=256):
        self.sigma = sigma
        self.alpha = alpha
        self.nnn = nnn
        self.distribute_singletons = distribute_singletons
        self.pca_dim = pca_dim

    def cluster(self, data, verbose=False):
        end = time.time()

        # preprocess the data
        xb = preprocess_features(data, self.pca_dim)

        # construct nnn graph
        I, D = make_graph(xb, self.nnn)

        # run PIC
        clust = run_pic(I, D, self.sigma, self.alpha)
        images_lists = {}
        for h in set(clust):
            images_lists[h] = []
        # do not shadow the `data` argument here; it is needed below
        for idx, c in enumerate(clust):
            images_lists[c].append(idx)

        # allocate singletons to clusters of their closest non-singleton NN
        if self.distribute_singletons:
            clust_NN = {}
            for i in images_lists:
                # if singleton
                if len(images_lists[i]) == 1:
                    s = images_lists[i][0]
                    # for NN
                    for n in I[s, 1:]:
                        # if NN is not a singleton
                        if not len(images_lists[clust[n]]) == 1:
                            clust_NN[s] = n
                            break
            for s in clust_NN:
                del images_lists[clust[s]]
                clust[s] = clust[clust_NN[s]]
                images_lists[clust[s]].append(s)

        self.images_lists = []
        self.labels = -1 * np.ones((data.shape[0], ), dtype=int)
        for i, c in enumerate(images_lists):
            self.images_lists.append(images_lists[c])
            self.labels[images_lists[c]] = i
        assert np.all(self.labels != -1)

        if verbose:
            print('pic time: {0:.0f} s'.format(time.time() - end))
        return 0
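

# Usage sketch (illustrative; `feat` stands for any N x dim float32
# feature matrix):
#
#   pic = PIC(sigma=0.2, nnn=5, alpha=0.001, pca_dim=256)
#   pic.cluster(feat, verbose=True)
#   pseudolabels = arrange_clustering(pic.images_lists)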