Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/Clustering.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved
10 // -*- c++ -*-
11 
12 #ifndef FAISS_CLUSTERING_H
13 #define FAISS_CLUSTERING_H
14 #include "Index.h"
15 
16 #include <vector>
17 
18 namespace faiss {
19 
20 
21 /** Class for the clustering parameters. Can be passed to the
22  * constructor of the Clustering object.
23  */
25  int niter; ///< clustering iterations
26  int nredo; ///< redo clustering this many times and keep best
27 
28  bool verbose;
29  bool spherical; ///< do we want normalized centroids?
30  bool update_index; ///< update index after each iteration?
31  bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
32 
33  int min_points_per_centroid; ///< otherwise you get a warning
34  int max_points_per_centroid; ///< to limit size of dataset
35 
36  int seed; ///< seed for the random number generator
37 
38  /// sets reasonable defaults
40 };
41 
42 
43 /** clustering based on assignment - centroid update iterations
44  *
45  * The clustering is based on an Index object that assigns training
46  * points to the centroids. Therefore, at each iteration the centroids
47  * are added to the index.
48  *
49  * On output, the centroids table is set to the latest version
50  * of the centroids and they are also added to the index. If the
51  * centroids table is not empty on input, it is also used for
52  * initialization.
53  *
54  * To do several clusterings, just call train() several times on
55  * different training sets, clearing the centroid table in between.
56  */
58  typedef Index::idx_t idx_t;
59  size_t d; ///< dimension of the vectors
60  size_t k; ///< nb of centroids
61 
62  /// centroids (k * d)
63  std::vector<float> centroids;
64 
65  /// objective values (sum of distances reported by index) over
66  /// iterations
67  std::vector<float> obj;
68 
69  /// the only mandatory parameters are k and d
70  Clustering (int d, int k);
71  Clustering (int d, int k, const ClusteringParameters &cp);
72 
73  /// Index is used during the assignment stage
74  virtual void train (idx_t n, const float * x, faiss::Index & index);
75 
76  virtual ~Clustering() {}
77 };
78 
79 
80 /** simplified interface: k-means clustering of a training set
81  *
82  * @param d dimension of the data
83  * @param n nb of training vectors
84  * @param k nb of output centroids
85  * @param x training set (size n * d)
86  * @param centroids output centroids (size k * d); NOTE(review): presumably a caller-allocated buffer -- confirm
87  * @return final quantization error
88  */
89 float kmeans_clustering (size_t d, size_t n, size_t k,
90  const float *x,
91  float *centroids);
92 
93 
94 
95 }
96 
97 
98 #endif
int niter
clustering iterations
Definition: Clustering.h:25
int nredo
redo clustering this many times and keep best
Definition: Clustering.h:26
ClusteringParameters()
sets reasonable defaults
Definition: Clustering.cpp:27
Clustering(int d, int k)
the only mandatory parameters are k and d
Definition: Clustering.cpp:40
size_t k
nb of centroids
Definition: Clustering.h:60
int seed
seed for the random number generator
Definition: Clustering.h:36
bool frozen_centroids
use the centroids provided as input and do not change them during iterations
Definition: Clustering.h:31
int min_points_per_centroid
otherwise you get a warning
Definition: Clustering.h:33
std::vector< float > obj
Definition: Clustering.h:67
long idx_t
all indices are this type
Definition: Index.h:62
float kmeans_clustering(size_t d, size_t n, size_t k, const float *x, float *centroids)
Definition: Clustering.cpp:241
std::vector< float > centroids
centroids (k * d)
Definition: Clustering.h:63
size_t d
dimension of the vectors
Definition: Clustering.h:59
bool update_index
update index after each iteration?
Definition: Clustering.h:30
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
Definition: Clustering.cpp:67
bool spherical
do we want normalized centroids?
Definition: Clustering.h:29
int max_points_per_centroid
to limit size of dataset
Definition: Clustering.h:34