Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/Clustering.h
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved
11 // -*- c++ -*-
12 
13 #ifndef FAISS_CLUSTERING_H
14 #define FAISS_CLUSTERING_H
15 #include "Index.h"
16 
17 #include <vector>
18 
19 namespace faiss {
20 
21 
22 /** Class for the clustering parameters. Can be passed to the
23  * constructor of the Clustering object.
24  */
26  int niter; ///< clustering iterations
27  int nredo; ///< redo clustering this many times and keep best
28 
29 
30  bool verbose;
31  bool spherical; ///< do we want normalized centroids?
32  bool update_index; ///< update index after each iteration?
33 
34  int min_points_per_centroid; ///< otherwise you get a warning
35  int max_points_per_centroid; ///< to limit size of dataset
36 
37  int seed; ///< seed for the random number generator
38 
39  /// sets reasonable defaults
41 };
42 
43 
44 /** clustering based on assignment - centroid update iterations
45  *
46  * The clustering is based on an Index object that assigns training
47  * points to the centroids. Therefore, at each iteration the centroids
48  * are added to the index.
49  *
50  * On output, the centroids table is set to the latest version
51  * of the centroids and they are also added to the index. If the
52  * centroids table is not empty on input, it is also used for
53  * initialization.
54  *
55  * To do several clusterings, just call train() several times on
56  * different training sets, clearing the centroid table in between.
57  */
59  typedef Index::idx_t idx_t;
60  size_t d; ///< dimension of the vectors
61  size_t k; ///< nb of centroids
62 
63  /// centroids (k * d)
64  std::vector<float> centroids;
65 
66  /// objective values (sum of distances reported by index) over
67  /// iterations
68  std::vector<float> obj;
69 
70  /// the only mandatory parameters are k and d
71  Clustering (int d, int k);
72  Clustering (int d, int k, const ClusteringParameters &cp);
73 
74  /// Index is used during the assignment stage
75  virtual void train (idx_t n, const float * x, faiss::Index & index);
76 
77  virtual ~Clustering() {}
78 };
79 
80 
81 /** simplified interface to k-means clustering
82  *
83  * @param d dimension of the data
84  * @param n nb of training vectors
85  * @param k nb of output centroids
86  * @param x training set (size n * d), read-only
87  * @param centroids output centroids (size k * d), written by the call
88  * @return final quantization error
89  */
90 float kmeans_clustering (size_t d, size_t n, size_t k,
91  const float *x,
92  float *centroids);
93 
94 
95 
96 }
97 
98 
99 #endif
int niter
clustering iterations
Definition: Clustering.h:26
int nredo
redo clustering this many times and keep best
Definition: Clustering.h:27
ClusteringParameters()
sets reasonable defaults
Definition: Clustering.cpp:28
Clustering(int d, int k)
the only mandatory parameters are k and d
Definition: Clustering.cpp:40
size_t k
nb of centroids
Definition: Clustering.h:61
int seed
seed for the random number generator
Definition: Clustering.h:37
int min_points_per_centroid
otherwise you get a warning
Definition: Clustering.h:34
std::vector< float > obj
Definition: Clustering.h:68
long idx_t
all indices are this type
Definition: Index.h:64
float kmeans_clustering(size_t d, size_t n, size_t k, const float *x, float *centroids)
Definition: Clustering.cpp:204
std::vector< float > centroids
centroids (k * d)
Definition: Clustering.h:64
size_t d
dimension of the vectors
Definition: Clustering.h:60
bool update_index
update index after each iteration?
Definition: Clustering.h:32
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
Definition: Clustering.cpp:67
bool spherical
do we want normalized centroids?
Definition: Clustering.h:31
int max_points_per_centroid
to limit size of dataset
Definition: Clustering.h:35