Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/Clustering.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved
10 // -*- c++ -*-
11 
12 #ifndef FAISS_CLUSTERING_H
13 #define FAISS_CLUSTERING_H
14 #include "Index.h"
15 
16 #include <vector>
17 
18 namespace faiss {
19 
20 
21 /** Class for the clustering parameters. Can be passed to the
22  * constructor of the Clustering object.
23  */
25  int niter; ///< clustering iterations
26  int nredo; ///< redo clustering this many times and keep best
27 
28  bool verbose;
29  bool spherical; ///< do we want normalized centroids?
30  bool update_index; ///< update index after each iteration?
31 
32  int min_points_per_centroid; ///< otherwise you get a warning
33  int max_points_per_centroid; ///< to limit size of dataset
34 
35  int seed; ///< seed for the random number generator
36 
37  /// sets reasonable defaults
39 };
40 
41 
42 /** clustering based on assignment - centroid update iterations
43  *
44  * The clustering is based on an Index object that assigns training
45  * points to the centroids. Therefore, at each iteration the centroids
46  * are added to the index.
47  *
48  * On output, the centroids table is set to the latest version
49  * of the centroids and they are also added to the index. If the
50  * centroids table is not empty on input, it is also used for
51  * initialization.
52  *
53  * To do several clusterings, just call train() several times on
54  * different training sets, clearing the centroid table in between.
55  */
57  typedef Index::idx_t idx_t;
58  size_t d; ///< dimension of the vectors
59  size_t k; ///< nb of centroids
60 
61  /// centroids (k * d)
62  std::vector<float> centroids;
63 
64  /// objective values (sum of distances reported by index) over
65  /// iterations
66  std::vector<float> obj;
67 
68  /// the only mandatory parameters are k and d
69  Clustering (int d, int k);
70  Clustering (int d, int k, const ClusteringParameters &cp);
71 
72  /// Index is used during the assignment stage
73  virtual void train (idx_t n, const float * x, faiss::Index & index);
74 
75  virtual ~Clustering() {}
76 };
77 
78 
79 /** Simplified interface: clusters a training set in a single call.
80  *
81  * @param d dimension of the data
82  * @param n number of training vectors
83  * @param k number of output centroids
84  * @param x training set (size n * d)
85  * @param centroids output centroids (size k * d); presumably caller-allocated -- TODO confirm
86  * @return final quantization error
87  */
88 float kmeans_clustering (size_t d, size_t n, size_t k,
89  const float *x,
90  float *centroids);
91 
92 
93 
94 }
95 
96 
97 #endif
int niter
clustering iterations
Definition: Clustering.h:25
int nredo
redo clustering this many times and keep best
Definition: Clustering.h:26
ClusteringParameters()
sets reasonable defaults
Definition: Clustering.cpp:27
Clustering(int d, int k)
the only mandatory parameters are k and d
Definition: Clustering.cpp:39
size_t k
nb of centroids
Definition: Clustering.h:59
int seed
seed for the random number generator
Definition: Clustering.h:35
int min_points_per_centroid
otherwise you get a warning
Definition: Clustering.h:32
std::vector< float > obj
Definition: Clustering.h:66
long idx_t
all indices are this type
Definition: Index.h:62
float kmeans_clustering(size_t d, size_t n, size_t k, const float *x, float *centroids)
Definition: Clustering.cpp:204
std::vector< float > centroids
centroids (k * d)
Definition: Clustering.h:62
size_t d
dimension of the vectors
Definition: Clustering.h:58
bool update_index
update index after each iteration?
Definition: Clustering.h:30
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
Definition: Clustering.cpp:66
bool spherical
do we want normalized centroids?
Definition: Clustering.h:29
int max_points_per_centroid
to limit size of dataset
Definition: Clustering.h:33