Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/hoss/faiss/utils.h
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 // -*- c++ -*-
9 
10 /*
11  * A few utility functions for similarity search:
12  * - random generators
13  * - optimized exhaustive distance and knn search functions
14  * - some functions reimplemented from torch for speed
15  */
16 
17 #ifndef FAISS_utils_h
18 #define FAISS_utils_h
19 
20 #include <random>
21 
22 #include <stdint.h>
23 
24 #include "Heap.h"
25 
26 
27 namespace faiss {
28 
29 
30 /**************************************************
31  * Get some stats about the system
32 **************************************************/
33 
34 
35 /// ms elapsed since some arbitrary epoch
36 double getmillisecs ();
37 
38 /// get current RSS usage in kB
39 size_t get_mem_usage_kb ();
40 
41 
42 /**************************************************
43  * Random data generation functions
44  **************************************************/
45 
46 /// random generator that can be used in multithreaded contexts
47 struct RandomGenerator {
48 
49  std::mt19937 mt;
50 
51  /// random positive integer
52  int rand_int ();
53 
54  /// random long
55  long rand_long ();
56 
57  /// generate random integer between 0 and max-1
58  int rand_int (int max);
59 
60  /// between 0 and 1
61  float rand_float ();
62 
63  double rand_double ();
64 
65  explicit RandomGenerator (long seed = 1234);
66 };
67 
68 /* Generate an array of uniform random floats / multi-threaded implementation */
69 void float_rand (float * x, size_t n, long seed);
70 void float_randn (float * x, size_t n, long seed);
71 void long_rand (long * x, size_t n, long seed);
72 void byte_rand (uint8_t * x, size_t n, long seed);
73 
74 /* random permutation */
75 void rand_perm (int * perm, size_t n, long seed);
76 
77 
78 
79  /*********************************************************
80  * Optimized distance/norm/inner prod computations
81  *********************************************************/
82 
83 
84 /// Squared L2 distance between two vectors
85 float fvec_L2sqr (
86  const float * x,
87  const float * y,
88  size_t d);
89 
90 /* SSE-implementation of inner product and L2 distance */
91 float fvec_inner_product (
92  const float * x,
93  const float * y,
94  size_t d);
95 
96 
97 /// a balanced assignment has an IF (imbalance factor) of 1
98 double imbalance_factor (int n, int k, const long *assign);
99 
100 /// same, takes a histogram as input
101 double imbalance_factor (int k, const int *hist);
102 
103 /** Compute pairwise distances between sets of vectors
104  *
105  * @param d dimension of the vectors
106  * @param nq nb of query vectors
107  * @param nb nb of database vectors
108  * @param xq query vectors (size nq * d)
109  * @param xb database vectors (size nb * d)
110  * @param dis output distances (size nq * nb)
111  * @param ldq,ldb, ldd strides for the matrices
112  */
113 void pairwise_L2sqr (long d,
114  long nq, const float *xq,
115  long nb, const float *xb,
116  float *dis,
117  long ldq = -1, long ldb = -1, long ldd = -1);
118 
119 
120 /* compute the inner products between one vector x and ny vectors y */
121 void fvec_inner_products_ny (
122  float * ip, /* output inner product */
123  const float * x,
124  const float * y,
125  size_t d, size_t ny);
126 
127 /* compute ny square L2 distances between x and a set of contiguous y vectors */
128 void fvec_L2sqr_ny (
129  float * __restrict dis,
130  const float * x,
131  const float * y,
132  size_t d, size_t ny);
133 
134 
135 /** squared norm of a vector */
136 float fvec_norm_L2sqr (const float * x,
137  size_t d);
138 
139 /** compute the L2 norms for a set of vectors
140  *
141  * @param ip output norms, size nx
142  * @param x set of vectors, size nx * d
143  */
144 void fvec_norms_L2 (float * ip, const float * x, size_t d, size_t nx);
145 
146 /// same as fvec_norms_L2, but computes square norms
147 void fvec_norms_L2sqr (float * ip, const float * x, size_t d, size_t nx);
148 
149 /* L2-renormalize a set of vectors. Nothing done if the vector is 0-normed */
150 void fvec_renorm_L2 (size_t d, size_t nx, float * x);
151 
152 
153 /* This function exists because the Torch counterpart is extremely slow
154  (not multi-threaded + unexpected overhead even in single thread).
155  It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y> */
156 void inner_product_to_L2sqr (float * __restrict dis,
157  const float * nr1,
158  const float * nr2,
159  size_t n1, size_t n2);
160 
161 /***************************************************************************
162  * Compute a subset of distances
163  ***************************************************************************/
164 
165  /* compute the inner product between x and a subset y of ny vectors,
166  whose indices are given by ids. */
167 void fvec_inner_products_by_idx (
168  float * __restrict ip,
169  const float * x,
170  const float * y,
171  const long * __restrict ids,
172  size_t d, size_t nx, size_t ny);
173 
174 /* same but for a subset in y indexed by ids (ny vectors in total) */
175 void fvec_L2sqr_by_idx (
176  float * __restrict dis,
177  const float * x,
178  const float * y,
179  const long * __restrict ids, /* ids of y vecs */
180  size_t d, size_t nx, size_t ny);
181 
182 /***************************************************************************
183  * KNN functions
184  ***************************************************************************/
185 
186 // threshold on nx above which we switch to BLAS to compute distances
187 extern int distance_compute_blas_threshold;
188 
189 /** Return the k nearest neighbors of each of the nx vectors x among the ny
190  * vectors y, w.r.t. max inner product
191  *
192  * @param x query vectors, size nx * d
193  * @param y database vectors, size ny * d
194  * @param res result array, which also provides k. Sorted on output
195  */
196 void knn_inner_product (
197  const float * x,
198  const float * y,
199  size_t d, size_t nx, size_t ny,
200  float_minheap_array_t * res);
201 
202 /** Same as knn_inner_product, for the L2 distance */
203 void knn_L2sqr (
204  const float * x,
205  const float * y,
206  size_t d, size_t nx, size_t ny,
207  float_maxheap_array_t * res);
208 
209 /** same as knn_L2sqr, but base_shift[bno] is subtracted from all
210  * computed distances.
211  *
212  * @param base_shift size ny
213  */
214 void knn_L2sqr_base_shift (
215  const float * x,
216  const float * y,
217  size_t d, size_t nx, size_t ny,
218  float_maxheap_array_t * res,
219  const float *base_shift);
220 
221 /* Find the nearest neighbors for nx queries in a set of ny vectors
222  * indexed by ids. May be useful for re-ranking a pre-selected vector list
223  */
224 void knn_inner_products_by_idx (
225  const float * x,
226  const float * y,
227  const long * ids,
228  size_t d, size_t nx, size_t ny,
229  float_minheap_array_t * res);
230 
231 void knn_L2sqr_by_idx (const float * x,
232  const float * y,
233  const long * __restrict ids,
234  size_t d, size_t nx, size_t ny,
235  float_maxheap_array_t * res);
236 
237 /***************************************************************************
238  * Range search
239  ***************************************************************************/
240 
241 
242 
243 /// Forward declaration, see AuxIndexStructures.h
244 struct RangeSearchResult;
245 
246 /** For each of the nx vectors x, return the vectors among the ny
247  * vectors y that lie within the search radius, w.r.t. L2sqr distance
248  *
249  * @param x query vectors, size nx * d
250  * @param y database vectors, size ny * d
251  * @param radius search radius around the x vectors
252  * @param result result structure
253  */
254 void range_search_L2sqr (
255  const float * x,
256  const float * y,
257  size_t d, size_t nx, size_t ny,
258  float radius,
259  RangeSearchResult *result);
260 
261 /// same as range_search_L2sqr for the inner product similarity
262 void range_search_inner_product (
263  const float * x,
264  const float * y,
265  size_t d, size_t nx, size_t ny,
266  float radius,
267  RangeSearchResult *result);
268 
269 
270 
271 
272 
273 /***************************************************************************
274  * Misc matrix and vector manipulation functions
275  ***************************************************************************/
276 
277 
278 /** compute c := a + bf * b for a, b and c tables
279  *
280  * @param n size of the tables
281  * @param a size n
282  * @param b size n
283  * @param c result table, size n
284  */
285 void fvec_madd (size_t n, const float *a,
286  float bf, const float *b, float *c);
287 
288 
289 /** same as fvec_madd, also return index of the min of the result table
290  * @return index of the min of table c
291  */
292 int fvec_madd_and_argmin (size_t n, const float *a,
293  float bf, const float *b, float *c);
294 
295 
296 /* perform a reflection (not an efficient implementation, just for test ) */
297 void reflection (const float * u, float * x, size_t n, size_t d, size_t nu);
298 
299 
300 /** For k-means: update stage.
301  *
302  * @param x training vectors, size n * d
303  * @param centroids centroid vectors, size k * d
304  * @param assign nearest centroid for each training vector, size n
305  * @param k_frozen do not update the k_frozen first centroids
306  * @return nb of splitting operations to fight empty clusters
307  */
308 int km_update_centroids (
309  const float * x,
310  float * centroids,
311  long * assign,
312  size_t d, size_t k, size_t n,
313  size_t k_frozen);
314 
315 /** compute the Q of the QR decomposition for m > n
316  * @param a size n * m: input matrix and output Q
317  */
318 void matrix_qr (int m, int n, float *a);
319 
320 /** distances are supposed to be sorted. Sorts indices with same distance*/
321 void ranklist_handle_ties (int k, long *idx, const float *dis);
322 
323 /** count the number of common elements between v1 and v2
324  * algorithm = sorting + bisection to avoid double-counting duplicates
325  */
326 size_t ranklist_intersection_size (size_t k1, const long *v1,
327  size_t k2, const long *v2);
328 
329 /** merge a result table into another one
330  *
331  * @param I0, D0 first result table, size (n, k)
332  * @param I1, D1 second result table, size (n, k)
333  * @param keep_min if true, keep min values, otherwise keep max
334  * @param translation add this value to all I1's indexes
335  * @return nb of values that were taken from the second table
336  */
337 size_t merge_result_table_with (size_t n, size_t k,
338  long *I0, float *D0,
339  const long *I1, const float *D1,
340  bool keep_min = true,
341  long translation = 0);
342 
343 
344 
345 void fvec_argsort (size_t n, const float *vals,
346  size_t *perm);
347 
348 void fvec_argsort_parallel (size_t n, const float *vals,
349  size_t *perm);
350 
351 
352 /// compute histogram on v
353 int ivec_hist (size_t n, const int * v, int vmax, int *hist);
354 
355 /** Compute histogram of bits on a code array
356  *
357  * @param codes size(n, nbits / 8)
358  * @param hist size(nbits): nb of 1s in the array of codes
359  */
360 void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist);
361 
362 
363 /// compute a checksum on a table.
364 size_t ivec_checksum (size_t n, const int *a);
365 
366 
367 /** randomly subsamples a set of vectors if there are too many of them
368  *
369  * @param d dimension of the vectors
370  * @param n on input: nb of input vectors, output: nb of output vectors
371  * @param nmax max nb of vectors to keep
372  * @param x input array, size *n-by-d
373  * @param seed random seed to use for sampling
374  * @return x or an array allocated with new [] with *n vectors
375  */
376 const float *fvecs_maybe_subsample (
377  size_t d, size_t *n, size_t nmax, const float *x,
378  bool verbose = false, long seed = 1234);
379 
380 /** Convert binary vector to +1/-1 valued float vector.
381  *
382  * @param d dimension of the vector (multiple of 8)
383  * @param x_in input binary vector (uint8_t table of size d / 8)
384  * @param x_out output float vector (float table of size d)
385  */
386 void binary_to_real(size_t d, const uint8_t *x_in, float *x_out);
387 
388 /** Convert float vector to binary vector. Components > 0 are converted to 1,
389  * others to 0.
390  *
391  * @param d dimension of the vector (multiple of 8)
392  * @param x_in input float vector (float table of size d)
393  * @param x_out output binary vector (uint8_t table of size d / 8)
394  */
395 void real_to_binary(size_t d, const float *x_in, uint8_t *x_out);
396 
397 
398 /** A reasonable hashing function */
399 uint64_t hash_bytes (const uint8_t *bytes, long n);
400 
401 /** Whether OpenMP annotations were respected. */
402 bool check_openmp();
403 
404 } // namespace faiss
405 
406 
407 #endif /* FAISS_utils_h */
random generator that can be used in multithreaded contexts
Definition: utils.h:47
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n, size_t k_frozen)
Definition: utils.cpp:1078
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
Definition: utils.cpp:664
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils_simd.cpp:501
bool check_openmp()
Definition: utils.cpp:1596
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
Definition: utils.cpp:1326
const float * fvecs_maybe_subsample(size_t d, size_t *n, size_t nmax, const float *x, bool verbose, long seed)
Definition: utils.cpp:1540
void ranklist_handle_ties(int k, long *idx, const float *dis)
Definition: utils.cpp:1172
float rand_float()
between 0 and 1
Definition: utils.cpp:129
void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils_simd.cpp:588
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
Definition: utils.cpp:1315
long rand_long()
random long
Definition: utils.cpp:119
uint64_t hash_bytes(const uint8_t *bytes, long n)
Definition: utils.cpp:1584
size_t merge_result_table_with(size_t n, size_t k, long *I0, float *D0, const long *I1, const float *D1, bool keep_min, long translation)
Definition: utils.cpp:1188
int rand_int()
random positive integer
Definition: utils.cpp:114
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
Definition: utils.cpp:1253
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
Definition: utils.cpp:1021
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
Definition: utils.cpp:959
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
Definition: utils.cpp:624
double getmillisecs()
ms elapsed since some arbitrary epoch
Definition: utils.cpp:69
void real_to_binary(size_t d, const float *x_in, uint8_t *x_out)
Definition: utils.cpp:1570
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
Definition: utils.cpp:1291
float fvec_norm_L2sqr(const float *x, size_t d)
Definition: utils_simd.cpp:515
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
Definition: utils.cpp:944
void matrix_qr(int m, int n, float *a)
Definition: utils.cpp:999
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
Definition: utils.cpp:1349
int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils_simd.cpp:675
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)
Definition: utils.cpp:644
void binary_to_real(size_t d, const uint8_t *x_in, float *x_out)
Definition: utils.cpp:1564