10 #include "Clustering.h"
11 #include "AuxIndexStructures.h"
19 #include "FaissAssert.h"
20 #include "IndexFlat.h"
31 frozen_centroids(false),
32 min_points_per_centroid(39),
33 max_points_per_centroid(256),
47 static double imbalance_factor (
int n,
int k,
long *assign) {
48 std::vector<int> hist(k, 0);
49 for (
int i = 0; i < n; i++)
52 double tot = 0, uf = 0;
54 for (
int i = 0 ; i < k ; i++) {
56 uf += hist[i] * (double) hist[i];
58 uf = uf * k / (tot * tot);
71 for (
size_t i = 0; i <
centroids.size(); i++)
78 FAISS_THROW_IF_NOT_FMT (nx >= k,
79 "Number of training points (%ld) should be at least "
80 "as large as number of clusters (%ld)", nx, k);
86 for (
size_t i = 0; i < nx *
d; i++) {
87 FAISS_THROW_IF_NOT_MSG (finite (x_in[i]),
88 "input contains NaN's or Inf's");
91 const float *x = x_in;
96 printf(
"Sampling a subset of %ld / %ld for training\n",
97 k * max_points_per_centroid, nx);
98 std::vector<int> perm (nx);
99 rand_perm (perm.data (), nx,
seed);
101 float * x_new =
new float [nx *
d];
102 for (idx_t i = 0; i < nx; i++)
103 memcpy (x_new + i * d, x + perm[i] * d,
sizeof(x_new[0]) *
d);
108 "WARNING clustering %ld points to %ld centroids: "
109 "please provide at least %ld training points\n",
110 nx, k, idx_t(k) * min_points_per_centroid);
116 printf(
"Number of training points (%ld) same as number of "
117 "clusters, just copying\n", nx);
121 memcpy (
centroids.data(), x_in,
sizeof (*x_in) * d *
k);
129 printf(
"Clustering %d points in %ldD to %ld clusters, "
130 "redo %d times, %d iterations\n",
133 idx_t * assign =
new idx_t[nx];
135 float * dis =
new float[nx];
139 float best_err = HUGE_VALF;
140 std::vector<float> best_obj;
141 std::vector<float> best_centroids;
145 FAISS_THROW_IF_NOT_MSG (
147 "size of provided input centroids not a multiple of dimension");
149 size_t n_input_centroids =
centroids.size() /
d;
151 if (verbose && n_input_centroids > 0) {
152 printf (
" Using %zd centroids provided as input (%sfrozen)\n",
156 double t_search_tot = 0;
158 printf(
" Preprocessing in %.2f s\n",
163 for (
int redo = 0; redo <
nredo; redo++) {
165 if (verbose && nredo > 1) {
166 printf(
"Outer iteration %d / %d\n", redo, nredo);
171 std::vector<int> perm (nx);
173 rand_perm (perm.data(), nx,
seed + 1 + redo * 15486557L);
174 for (
int i = n_input_centroids; i <
k ; i++)
175 memcpy (&
centroids[i * d], x + perm[i] * d,
190 for (
int i = 0; i <
niter; i++) {
192 index.
search (nx, x, 1, dis, assign);
197 for (
int j = 0; j < nx; j++)
206 printf (
" Iteration %d (%.2f s, search %.2f s): "
207 "objective=%g imbalance=%.3f nsplit=%d \r",
210 err, imbalance_factor (nx, k, assign),
221 assert (index.
ntotal == 0);
225 if (verbose) printf(
"\n");
227 if (err < best_err) {
229 printf (
"Objective improved: keep new clusters\n");
241 index.
add(k, best_centroids.data());
251 clus.verbose = d * n * k > (1L << 30);
254 clus.
train (n, x, index);
255 memcpy(centroids, clus.
centroids.data(),
sizeof(*centroids) * d * k);
256 return clus.
obj.back();
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n, size_t k_frozen)
int niter
clustering iterations
int nredo
redo clustering this many times and keep best
ClusteringParameters()
sets reasonable defaults
virtual void reset()=0
removes all elements from the database.
Clustering(int d, int k)
the only mandatory parameters are k and d
virtual void train(idx_t n, const float *x)
int seed
seed for the random number generator
bool frozen_centroids
use the centroids provided as input and do not change them during iterations
int min_points_per_centroid
minimum number of training points per centroid; with fewer points, training prints a warning
virtual void add(idx_t n, const float *x)=0
void post_process_centroids()
float kmeans_clustering(size_t d, size_t n, size_t k, const float *x, float *centroids)
idx_t ntotal
total number of indexed vectors
double getmillisecs()
milliseconds elapsed since some arbitrary epoch
std::vector< float > centroids
centroids (k * d)
size_t d
dimension of the vectors
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
bool update_index
update index after each iteration?
bool int_centroids
round centroid coordinates to integers
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
bool is_trained
set if the Index does not require training, or if training is done already
bool spherical
do we want normalized centroids?
int max_points_per_centroid
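maximum number of training points per centroid; larger training sets are subsampled to k * max_points_per_centroid points to limit the size of the dataset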