Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/utils.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 /** Copyright 2004-present Facebook. All Rights Reserved
10  * -*- c++ -*-
11  *
12  * A few utility functions for similarity search:
13  * - random generators
14  * - optimized exhaustive distance and knn search functions
15  * - some functions reimplemented from torch for speed
16  */
17 
18 #ifndef FAISS_utils_h
19 #define FAISS_utils_h
20 
21 #include <stdint.h>
22 // for the random data struct
23 #include <cstdlib>
24 
25 #include "Heap.h"
26 
27 
28 namespace faiss {
29 
30 
31 /**************************************************
32  * Get some stats about the system
33 **************************************************/
34 
35 
36 /// ms elapsed since some arbitrary epoch
37 double getmillisecs ();
38 
39 /// get current RSS usage in kB
40 size_t get_mem_usage_kb ();
41 
42 
43 /**************************************************
44  * Random data generation functions
45  **************************************************/
46 
47 /// random generator that can be used in multithreaded contexts
48 struct RandomGenerator {
49 
50 #ifdef __linux__
51  char rand_state [8];
52  struct random_data rand_data;
53 #elif __APPLE__
54  unsigned rand_state;
55 #endif
56 
57  /// random 31-bit positive integer
58  int rand_int ();
59 
60  /// random long < 2 ^ 62
61  long rand_long ();
62 
63  /// generate random number between 0 and max-1
64  int rand_int (int max);
65 
66  /// between 0 and 1
67  float rand_float ();
68 
69 
70  double rand_double ();
71 
72  /// initialize
73  explicit RandomGenerator (long seed = 1234);
74 
75  /// default copy constructor messes up pointer in rand_data
76  RandomGenerator (const RandomGenerator & other);
77 
78 };
79 
80 /* Generate an array of uniform random floats / multi-threaded implementation */
81 void float_rand (float * x, size_t n, long seed);
82 void float_randn (float * x, size_t n, long seed);
83 void long_rand (long * x, size_t n, long seed);
84 void byte_rand (uint8_t * x, size_t n, long seed);
85 
86 /* random permutation */
87 void rand_perm (int * perm, size_t n, long seed);
88 
89 
90 
91  /*********************************************************
92  * Optimized distance/norm/inner prod computations
93  *********************************************************/
94 
95 
96 /// Squared L2 distance between two vectors
97 float fvec_L2sqr (
98  const float * x,
99  const float * y,
100  size_t d);
101 
102 /* SSE-implementation of inner product and L2 distance */
103 float fvec_inner_product (
104  const float * x,
105  const float * y,
106  size_t d);
107 
108 
109 /// a balanced assignment has an imbalance factor (IF) of 1
110 double imbalance_factor (int n, int k, const long *assign);
111 
112 /// same, takes a histogram as input
113 double imbalance_factor (int k, const int *hist);
114 
115 /** Compute pairwise distances between sets of vectors
116  *
117  * @param d dimension of the vectors
118  * @param nq nb of query vectors
119  * @param nb nb of database vectors
120  * @param xq query vectors (size nq * d)
121  * @param xb database vectors (size nb * d)
122  * @param dis output distances (size nq * nb)
123  * @param ldq,ldb, ldd strides for the matrices
124  */
125 void pairwise_L2sqr (long d,
126  long nq, const float *xq,
127  long nb, const float *xb,
128  float *dis,
129  long ldq = -1, long ldb = -1, long ldd = -1);
130 
131 
132 /* compute the inner product between nx vectors x and one y */
133 void fvec_inner_products_ny (
134  float * ip, /* output inner product */
135  const float * x,
136  const float * y,
137  size_t d, size_t ny);
138 
139 /* compute ny squared L2 distances between x and a set of contiguous y vectors */
140 void fvec_L2sqr_ny (
141  float * __restrict dis,
142  const float * x,
143  const float * y,
144  size_t d, size_t ny);
145 
146 
147 /** squared norm of a vector */
148 float fvec_norm_L2sqr (const float * x,
149  size_t d);
150 
151 /** compute the L2 norms for a set of vectors
152  *
153  * @param ip output norms, size nx
154  * @param x set of vectors, size nx * d
155  */
156 void fvec_norms_L2 (float * ip, const float * x, size_t d, size_t nx);
157 
158 /// same as fvec_norms_L2, but computes square norms
159 void fvec_norms_L2sqr (float * ip, const float * x, size_t d, size_t nx);
160 
161 /* L2-renormalize a set of vectors. Nothing is done if a vector is 0-normed */
162 void fvec_renorm_L2 (size_t d, size_t nx, float * x);
163 
164 
165 /* This function exists because the Torch counterpart is extremely slow
166  (not multi-threaded + unexpected overhead even in single thread).
167  It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y> */
168 void inner_product_to_L2sqr (float * __restrict dis,
169  const float * nr1,
170  const float * nr2,
171  size_t n1, size_t n2);
172 
173 /***************************************************************************
174  * Compute a subset of distances
175  ***************************************************************************/
176 
177  /* compute the inner product between x and a subset y of ny vectors,
178  whose indices are given by ids. */
179 void fvec_inner_products_by_idx (
180  float * __restrict ip,
181  const float * x,
182  const float * y,
183  const long * __restrict ids,
184  size_t d, size_t nx, size_t ny);
185 
186 /* same but for a subset in y indexed by ids (ny vectors in total) */
187 void fvec_L2sqr_by_idx (
188  float * __restrict dis,
189  const float * x,
190  const float * y,
191  const long * __restrict ids, /* ids of y vecs */
192  size_t d, size_t nx, size_t ny);
193 
194 /***************************************************************************
195  * KNN functions
196  ***************************************************************************/
197 
198 // threshold on nx above which we switch to BLAS to compute distances
199 extern int distance_compute_blas_threshold;
200 
201 /** Return the k nearest neighbors of each of the nx vectors x among the ny
202  * vectors y, w.r.t. max inner product
203  *
204  * @param x query vectors, size nx * d
205  * @param y database vectors, size ny * d
206  * @param res result array, which also provides k. Sorted on output
207  */
208 void knn_inner_product (
209  const float * x,
210  const float * y,
211  size_t d, size_t nx, size_t ny,
212  float_minheap_array_t * res);
213 
214 /** Same as knn_inner_product, for the L2 distance */
215 void knn_L2sqr (
216  const float * x,
217  const float * y,
218  size_t d, size_t nx, size_t ny,
219  float_maxheap_array_t * res);
220 
221 /** same as knn_L2sqr, but base_shift[bno] is subtracted from all
222  * computed distances.
223  *
224  * @param base_shift size ny
225  */
226 void knn_L2sqr_base_shift (
227  const float * x,
228  const float * y,
229  size_t d, size_t nx, size_t ny,
230  float_maxheap_array_t * res,
231  const float *base_shift);
232 
233 /* Find the nearest neighbors for nx queries in a set of ny vectors
234  * indexed by ids. May be useful for re-ranking a pre-selected vector list
235  */
236 void knn_inner_products_by_idx (
237  const float * x,
238  const float * y,
239  const long * ids,
240  size_t d, size_t nx, size_t ny,
241  float_minheap_array_t * res);
242 
243 void knn_L2sqr_by_idx (const float * x,
244  const float * y,
245  const long * __restrict ids,
246  size_t d, size_t nx, size_t ny,
247  float_maxheap_array_t * res);
248 
249 /***************************************************************************
250  * Range search
251  ***************************************************************************/
252 
253 
254 
255 /// Forward declaration, see AuxIndexStructures.h
256 struct RangeSearchResult;
257 
258 /** Return, for each of the nx vectors x, the vectors y (among ny) that lie
259  * within the given radius, w.r.t. the squared L2 distance
260  *
261  * @param x query vectors, size nx * d
262  * @param y database vectors, size ny * d
263  * @param radius search radius around the x vectors
264  * @param result result structure
265  */
266 void range_search_L2sqr (
267  const float * x,
268  const float * y,
269  size_t d, size_t nx, size_t ny,
270  float radius,
271  RangeSearchResult *result);
272 
273 /// same as range_search_L2sqr for the inner product similarity
274 void range_search_inner_product (
275  const float * x,
276  const float * y,
277  size_t d, size_t nx, size_t ny,
278  float radius,
279  RangeSearchResult *result);
280 
281 
282 
283 
284 
285 /***************************************************************************
286  * Misc matrix and vector manipulation functions
287  ***************************************************************************/
288 
289 
290 /** compute c := a + bf * b for a, b and c tables
291  *
292  * @param n size of the tables
293  * @param a size n
294  * @param b size n
295  * @param c result table, size n
296  */
297 void fvec_madd (size_t n, const float *a,
298  float bf, const float *b, float *c);
299 
300 
301 /** same as fvec_madd, also return index of the min of the result table
302  * @return index of the min of table c
303  */
304 int fvec_madd_and_argmin (size_t n, const float *a,
305  float bf, const float *b, float *c);
306 
307 
308 /* perform a reflection (not an efficient implementation, just for test ) */
309 void reflection (const float * u, float * x, size_t n, size_t d, size_t nu);
310 
311 
312 /** For k-means: update stage.
313  *
314  * @param x training vectors, size n * d
315  * @param centroids centroid vectors, size k * d
316  * @param assign nearest centroid for each training vector, size n
317  * @param k_frozen do not update the k_frozen first centroids
318  * @return nb of splitting operations to fight empty clusters
319  */
320 int km_update_centroids (
321  const float * x,
322  float * centroids,
323  long * assign,
324  size_t d, size_t k, size_t n,
325  size_t k_frozen);
326 
327 /** compute the Q of the QR decomposition for m > n
328  * @param a size n * m: input matrix and output Q
329  */
330 void matrix_qr (int m, int n, float *a);
331 
332 /** distances are supposed to be sorted. Sorts indices with same distance*/
333 void ranklist_handle_ties (int k, long *idx, const float *dis);
334 
335 /** count the number of common elements between v1 and v2
336  * algorithm = sorting + bisection to avoid double-counting duplicates
337  */
338 size_t ranklist_intersection_size (size_t k1, const long *v1,
339  size_t k2, const long *v2);
340 
341 /** merge a result table into another one
342  *
343  * @param I0, D0 first result table, size (n, k)
344  * @param I1, D1 second result table, size (n, k)
345  * @param keep_min if true, keep min values, otherwise keep max
346  * @param translation add this value to all I1's indexes
347  * @return nb of values that were taken from the second table
348  */
349 size_t merge_result_table_with (size_t n, size_t k,
350  long *I0, float *D0,
351  const long *I1, const float *D1,
352  bool keep_min = true,
353  long translation = 0);
354 
355 
356 
357 void fvec_argsort (size_t n, const float *vals,
358  size_t *perm);
359 
360 void fvec_argsort_parallel (size_t n, const float *vals,
361  size_t *perm);
362 
363 
364 /// compute histogram on v
365 int ivec_hist (size_t n, const int * v, int vmax, int *hist);
366 
367 /** Compute histogram of bits on a code array
368  *
369  * @param codes size(n, nbits / 8)
370  * @param hist size(nbits): nb of 1s in the array of codes
371  */
372 void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist);
373 
374 
375 /// compute a checksum on a table.
376 size_t ivec_checksum (size_t n, const int *a);
377 
378 
379 /** random subsamples a set of vectors if there are too many of them
380  *
381  * @param d dimension of the vectors
382  * @param n on input: nb of input vectors, output: nb of output vectors
383  * @param nmax max nb of vectors to keep
384  * @param x input array, size *n-by-d
385  * @param seed random seed to use for sampling
386  * @return x or an array allocated with new [] with *n vectors
387  */
388 const float *fvecs_maybe_subsample (
389  size_t d, size_t *n, size_t nmax, const float *x,
390  bool verbose = false, long seed = 1234);
391 
392 } // namespace faiss
393 
394 
395 #endif /* FAISS_utils_h */
random generator that can be used in multithreaded contexts
Definition: utils.h:48
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n, size_t k_frozen)
Definition: utils.cpp:1401
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
Definition: utils.cpp:988
RandomGenerator(long seed=1234)
initialize
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils.cpp:574
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
Definition: utils.cpp:1649
const float * fvecs_maybe_subsample(size_t d, size_t *n, size_t nmax, const float *x, bool verbose, long seed)
Definition: utils.cpp:1975
void ranklist_handle_ties(int k, long *idx, const float *dis)
Definition: utils.cpp:1495
float rand_float()
between 0 and 1
Definition: utils.cpp:211
void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1889
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
Definition: utils.cpp:1638
long rand_long()
random long < 2 ^ 62
size_t merge_result_table_with(size_t n, size_t k, long *I0, float *D0, const long *I1, const float *D1, bool keep_min, long translation)
Definition: utils.cpp:1511
int rand_int()
random 31-bit positive integer
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
Definition: utils.cpp:1576
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
Definition: utils.cpp:1344
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
Definition: utils.cpp:1282
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
Definition: utils.cpp:948
double getmillisecs()
ms elapsed since some arbitrary epoch
Definition: utils.cpp:74
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
Definition: utils.cpp:1614
float fvec_norm_L2sqr(const float *x, size_t d)
Definition: utils.cpp:632
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
Definition: utils.cpp:1267
void matrix_qr(int m, int n, float *a)
Definition: utils.cpp:1322
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
Definition: utils.cpp:1672
int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1963
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)
Definition: utils.cpp:968