Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/hoss/faiss/Clustering.h
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 // -*- c++ -*-
9 
10 #ifndef FAISS_CLUSTERING_H
11 #define FAISS_CLUSTERING_H
12 #include "Index.h"
13 
14 #include <vector>
15 
16 namespace faiss {
17 
18 
19 /** Class for the clustering parameters. Can be passed to the
20  * constructor of the Clustering object.
21  */
23  int niter; ///< clustering iterations
24  int nredo; ///< redo clustering this many times and keep best
25 
26  bool verbose;
27  bool spherical; ///< do we want normalized centroids?
28  bool int_centroids; ///< round centroids coordinates to integer
29  bool update_index; ///< update index after each iteration?
30  bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
31 
32  int min_points_per_centroid; ///< otherwise you get a warning
33  int max_points_per_centroid; ///< to limit size of dataset
34 
35  int seed; ///< seed for the random number generator
36 
37  /// sets reasonable defaults
39 };
40 
41 
42 /** clustering based on assignment - centroid update iterations
43  *
44  * The clustering is based on an Index object that assigns training
45  * points to the centroids. Therefore, at each iteration the centroids
46  * are added to the index.
47  *
48  * On output, the centroids table is set to the latest version
49  * of the centroids and they are also added to the index. If the
50  * centroids table is not empty on input, it is also used for
51  * initialization.
52  *
53  * To do several clusterings, just call train() several times on
54  * different training sets, clearing the centroid table in between.
55  */
57  typedef Index::idx_t idx_t;
58  size_t d; ///< dimension of the vectors
59  size_t k; ///< nb of centroids
60 
61  /// centroids (k * d)
62  std::vector<float> centroids;
63 
64  /// objective values (sum of distances reported by index) over
65  /// iterations
66  std::vector<float> obj;
67 
68  /// the only mandatory parameters are the vector dimension d and the number of centroids k
69  Clustering (int d, int k);
70  Clustering (int d, int k, const ClusteringParameters &cp); ///< overload that also takes the clustering parameters cp
71 
72  /// Run the clustering; index is used during the assignment stage to assign the training points to the centroids
73  virtual void train (idx_t n, const float * x, faiss::Index & index); ///< NOTE(review): x presumably holds n vectors of dimension d -- confirm against Clustering.cpp
74 
75  /// Post-process the centroids after each centroid update: optional L2
76  /// normalization and optional rounding to the nearest integer (presumably driven by spherical / int_centroids -- confirm)
77  void post_process_centroids ();
78 
79  virtual ~Clustering() {} ///< virtual destructor with an empty body
80 };
81 
82 
83 /** Simplified k-means clustering interface.
84  *
85  * @param d dimension of the data
86  * @param n number of training vectors
87  * @param k number of output centroids
88  * @param x training set (size n * d)
89  * @param centroids output centroids (size k * d)
90  * @return final quantization error
91  */
92 float kmeans_clustering (size_t d, size_t n, size_t k,
93  const float *x,
94  float *centroids);
95 
96 
97 
98 }
99 
100 
101 #endif
int niter
clustering iterations
Definition: Clustering.h:23
int nredo
redo clustering this many times and keep best
Definition: Clustering.h:24
ClusteringParameters()
sets reasonable defaults
Definition: Clustering.cpp:24
Clustering(int d, int k)
the only mandatory parameters are k and d
Definition: Clustering.cpp:39
size_t k
nb of centroids
Definition: Clustering.h:59
int seed
seed for the random number generator
Definition: Clustering.h:35
bool frozen_centroids
use the centroids provided as input and do not change them during iterations
Definition: Clustering.h:30
long idx_t
all indices are this type
Definition: Index.h:62
int min_points_per_centroid
otherwise you get a warning
Definition: Clustering.h:32
void post_process_centroids()
Definition: Clustering.cpp:63
std::vector< float > obj
Definition: Clustering.h:66
float kmeans_clustering(size_t d, size_t n, size_t k, const float *x, float *centroids)
Definition: Clustering.cpp:246
std::vector< float > centroids
centroids (k * d)
Definition: Clustering.h:62
size_t d
dimension of the vectors
Definition: Clustering.h:58
bool update_index
update index after each iteration?
Definition: Clustering.h:29
bool int_centroids
round centroids coordinates to integer
Definition: Clustering.h:28
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
Definition: Clustering.cpp:77
bool spherical
do we want normalized centroids?
Definition: Clustering.h:27
int max_points_per_centroid
to limit size of dataset
Definition: Clustering.h:33