Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/tmp/faiss/utils.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // -*- c++ -*-
10 
11 /*
12  * A few utility functions for similarity search:
13  * - random generators
14  * - optimized exhaustive distance and knn search functions
15  * - some functions reimplemented from torch for speed
16  */
17 
18 #ifndef FAISS_utils_h
19 #define FAISS_utils_h
20 
21 #include <random>
22 
23 #include <stdint.h>
24 
25 #include "Heap.h"
26 
27 
28 namespace faiss {
29 
30 
31 /**************************************************
32  * Get some stats about the system
33 **************************************************/
34 
35 
36 /// ms elapsed since some arbitrary epoch
37 double getmillisecs ();
38 
39 /// get current RSS usage in kB
40 size_t get_mem_usage_kb ();
41 
42 
43 /**************************************************
44  * Random data generation functions
45  **************************************************/
46 
47 /// random generator that can be used in multithreaded contexts
48 struct RandomGenerator {
49 
50  std::mt19937 mt;
51 
52  /// random positive integer
53  int rand_int ();
54 
55  /// random long
56  long rand_long ();
57 
58  /// generate random integer between 0 and max-1
59  int rand_int (int max);
60 
61  /// between 0 and 1
62  float rand_float ();
63 
64  double rand_double ();
65 
66  explicit RandomGenerator (long seed = 1234);
67 };
68 
69 /* Generate an array of uniform random floats / multi-threaded implementation */
70 void float_rand (float * x, size_t n, long seed);
71 void float_randn (float * x, size_t n, long seed);
72 void long_rand (long * x, size_t n, long seed);
73 void byte_rand (uint8_t * x, size_t n, long seed);
74 
75 /* random permutation */
76 void rand_perm (int * perm, size_t n, long seed);
77 
78 
79 
80  /*********************************************************
81  * Optimized distance/norm/inner prod computations
82  *********************************************************/
83 
84 
85 /// Squared L2 distance between two vectors
86 float fvec_L2sqr (
87  const float * x,
88  const float * y,
89  size_t d);
90 
91 /* SSE-implementation of inner product and L2 distance */
92 float fvec_inner_product (
93  const float * x,
94  const float * y,
95  size_t d);
96 
97 
98 /// imbalance factor (IF) of an assignment; a balanced assignment has an IF of 1
99 double imbalance_factor (int n, int k, const long *assign);
100 
101 /// same, takes a histogram as input
102 double imbalance_factor (int k, const int *hist);
103 
104 /** Compute pairwise distances between sets of vectors
105  *
106  * @param d dimension of the vectors
107  * @param nq nb of query vectors
108  * @param nb nb of database vectors
109  * @param xq query vectors (size nq * d)
110  * @param xb database vectors (size nb * d)
111  * @param dis output distances (size nq * nb)
112  * @param ldq,ldb, ldd strides for the matrices
113  */
114 void pairwise_L2sqr (long d,
115  long nq, const float *xq,
116  long nb, const float *xb,
117  float *dis,
118  long ldq = -1, long ldb = -1, long ldd = -1);
119 
120 
121 /* compute the inner products between one vector x and ny vectors y */
122 void fvec_inner_products_ny (
123  float * ip, /* output inner product */
124  const float * x,
125  const float * y,
126  size_t d, size_t ny);
127 
128 /* compute the ny squared L2 distances between x and a set of contiguous y vectors */
129 void fvec_L2sqr_ny (
130  float * __restrict dis,
131  const float * x,
132  const float * y,
133  size_t d, size_t ny);
134 
135 
136 /** squared norm of a vector */
137 float fvec_norm_L2sqr (const float * x,
138  size_t d);
139 
140 /** compute the L2 norms for a set of vectors
141  *
142  * @param ip output norms, size nx
143  * @param x set of vectors, size nx * d
144  */
145 void fvec_norms_L2 (float * ip, const float * x, size_t d, size_t nx);
146 
147 /// same as fvec_norms_L2, but computes square norms
148 void fvec_norms_L2sqr (float * ip, const float * x, size_t d, size_t nx);
149 
150 /* L2-renormalize a set of vectors. A vector with zero norm is left unchanged */
151 void fvec_renorm_L2 (size_t d, size_t nx, float * x);
152 
153 
154 /* This function exists because the Torch counterpart is extremely slow
155  (not multi-threaded + unexpected overhead even in single thread).
156  It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y> */
157 void inner_product_to_L2sqr (float * __restrict dis,
158  const float * nr1,
159  const float * nr2,
160  size_t n1, size_t n2);
161 
162 /***************************************************************************
163  * Compute a subset of distances
164  ***************************************************************************/
165 
166  /* compute the inner product between x and a subset y of ny vectors,
167  whose indices are given by ids. */
168 void fvec_inner_products_by_idx (
169  float * __restrict ip,
170  const float * x,
171  const float * y,
172  const long * __restrict ids,
173  size_t d, size_t nx, size_t ny);
174 
175 /* same for the squared L2 distance, with the subset of y indexed by ids (ny vectors in total) */
176 void fvec_L2sqr_by_idx (
177  float * __restrict dis,
178  const float * x,
179  const float * y,
180  const long * __restrict ids, /* ids of y vecs */
181  size_t d, size_t nx, size_t ny);
182 
183 /***************************************************************************
184  * KNN functions
185  ***************************************************************************/
186 
187 // threshold on nx above which we switch to BLAS to compute distances
188 extern int distance_compute_blas_threshold;
189 
190 /** Return the k nearest neighbors of each of the nx vectors x among the ny
191  * vectors y, w.r.t. max inner product
192  *
193  * @param x query vectors, size nx * d
194  * @param y database vectors, size ny * d
195  * @param res result array, which also provides k. Sorted on output
196  */
197 void knn_inner_product (
198  const float * x,
199  const float * y,
200  size_t d, size_t nx, size_t ny,
201  float_minheap_array_t * res);
202 
203 /** Same as knn_inner_product, for the L2 distance */
204 void knn_L2sqr (
205  const float * x,
206  const float * y,
207  size_t d, size_t nx, size_t ny,
208  float_maxheap_array_t * res);
209 
210 /** same as knn_L2sqr, but base_shift[bno] is subtracted from all
211  * computed distances.
212  *
213  * @param base_shift size ny
214  */
215 void knn_L2sqr_base_shift (
216  const float * x,
217  const float * y,
218  size_t d, size_t nx, size_t ny,
219  float_maxheap_array_t * res,
220  const float *base_shift);
221 
222 /* Find the nearest neighbors for nx queries in a set of ny vectors
223  * indexed by ids. May be useful for re-ranking a pre-selected vector list
224  */
225 void knn_inner_products_by_idx (
226  const float * x,
227  const float * y,
228  const long * ids,
229  size_t d, size_t nx, size_t ny,
230  float_minheap_array_t * res);
231 
232 void knn_L2sqr_by_idx (const float * x,
233  const float * y,
234  const long * __restrict ids,
235  size_t d, size_t nx, size_t ny,
236  float_maxheap_array_t * res);
237 
238 /***************************************************************************
239  * Range search
240  ***************************************************************************/
241 
242 
243 
244 /// Forward declaration, see AuxIndexStructures.h
245 struct RangeSearchResult;
246 
247 /** Range search: for each of the nx vectors x, return the vectors among the
248  * ny vectors y that fall within the search radius (squared L2 distance)
249  *
250  * @param x query vectors, size nx * d
251  * @param y database vectors, size ny * d
252  * @param radius search radius around the x vectors
253  * @param result result structure
254  */
255 void range_search_L2sqr (
256  const float * x,
257  const float * y,
258  size_t d, size_t nx, size_t ny,
259  float radius,
260  RangeSearchResult *result);
261 
262 /// same as range_search_L2sqr for the inner product similarity
263 void range_search_inner_product (
264  const float * x,
265  const float * y,
266  size_t d, size_t nx, size_t ny,
267  float radius,
268  RangeSearchResult *result);
269 
270 
271 
272 
273 
274 /***************************************************************************
275  * Misc matrix and vector manipulation functions
276  ***************************************************************************/
277 
278 
279 /** compute c := a + bf * b for a, b and c tables
280  *
281  * @param n size of the tables
282  * @param a size n
283  * @param b size n
284  * @param c result table, size n
285  */
286 void fvec_madd (size_t n, const float *a,
287  float bf, const float *b, float *c);
288 
289 
290 /** same as fvec_madd, also return index of the min of the result table
291  * @return index of the min of table c
292  */
293 int fvec_madd_and_argmin (size_t n, const float *a,
294  float bf, const float *b, float *c);
295 
296 
297 /* perform a reflection (not an efficient implementation, just for test ) */
298 void reflection (const float * u, float * x, size_t n, size_t d, size_t nu);
299 
300 
301 /** For k-means: update stage.
302  *
303  * @param x training vectors, size n * d
304  * @param centroids centroid vectors, size k * d
305  * @param assign nearest centroid for each training vector, size n
306  * @param k_frozen do not update the k_frozen first centroids
307  * @return nb of splitting operations to fight empty clusters
308  */
309 int km_update_centroids (
310  const float * x,
311  float * centroids,
312  long * assign,
313  size_t d, size_t k, size_t n,
314  size_t k_frozen);
315 
316 /** compute the Q of the QR decomposition for m > n
317  * @param a size n * m: input matrix and output Q
318  */
319 void matrix_qr (int m, int n, float *a);
320 
321 /** distances are supposed to be sorted. Sorts indices with same distance*/
322 void ranklist_handle_ties (int k, long *idx, const float *dis);
323 
324 /** count the number of common elements between v1 and v2
325  * algorithm = sorting + bisection to avoid double-counting duplicates
326  */
327 size_t ranklist_intersection_size (size_t k1, const long *v1,
328  size_t k2, const long *v2);
329 
330 /** merge a result table into another one
331  *
332  * @param I0, D0 first result table, size (n, k)
333  * @param I1, D1 second result table, size (n, k)
334  * @param keep_min if true, keep min values, otherwise keep max
335  * @param translation add this value to all I1's indexes
336  * @return nb of values that were taken from the second table
337  */
338 size_t merge_result_table_with (size_t n, size_t k,
339  long *I0, float *D0,
340  const long *I1, const float *D1,
341  bool keep_min = true,
342  long translation = 0);
343 
344 
345 
346 void fvec_argsort (size_t n, const float *vals,
347  size_t *perm);
348 
349 void fvec_argsort_parallel (size_t n, const float *vals,
350  size_t *perm);
351 
352 
353 /// compute histogram on v
354 int ivec_hist (size_t n, const int * v, int vmax, int *hist);
355 
356 /** Compute histogram of bits on a code array
357  *
358  * @param codes size(n, nbits / 8)
359  * @param hist size(nbits): nb of 1s in the array of codes
360  */
361 void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist);
362 
363 
364 /// compute a checksum on a table.
365 size_t ivec_checksum (size_t n, const int *a);
366 
367 
368 /** randomly subsamples a set of vectors if there are too many of them
369  *
370  * @param d dimension of the vectors
371  * @param n on input: nb of input vectors, output: nb of output vectors
372  * @param nmax max nb of vectors to keep
373  * @param x input array, size *n-by-d
374  * @param seed random seed to use for sampling
375  * @return x or an array allocated with new [] with *n vectors
376  */
377 const float *fvecs_maybe_subsample (
378  size_t d, size_t *n, size_t nmax, const float *x,
379  bool verbose = false, long seed = 1234);
380 
381 /** Convert binary vector to +1/-1 valued float vector.
382  *
383  * @param d dimension of the vector (multiple of 8)
384  * @param x_in input binary vector (uint8_t table of size d / 8)
385  * @param x_out output float vector (float table of size d)
386  */
387 void binary_to_real(size_t d, const uint8_t *x_in, float *x_out);
388 
389 /** Convert float vector to binary vector. Components > 0 are converted to 1,
390  * others to 0.
391  *
392  * @param d dimension of the vector (multiple of 8)
393  * @param x_in input float vector (float table of size d)
394  * @param x_out output binary vector (uint8_t table of size d / 8)
395  */
396 void real_to_binary(size_t d, const float *x_in, uint8_t *x_out);
397 
398 
399 } // namespace faiss
400 
401 
402 #endif /* FAISS_utils_h */
random generator that can be used in multithreaded contexts
Definition: utils.h:48
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n, size_t k_frozen)
Definition: utils.cpp:1066
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
Definition: utils.cpp:653
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils_simd.cpp:502
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
Definition: utils.cpp:1314
const float * fvecs_maybe_subsample(size_t d, size_t *n, size_t nmax, const float *x, bool verbose, long seed)
Definition: utils.cpp:1528
void ranklist_handle_ties(int k, long *idx, const float *dis)
Definition: utils.cpp:1160
float rand_float()
between 0 and 1
Definition: utils.cpp:130
void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils_simd.cpp:589
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
Definition: utils.cpp:1303
long rand_long()
random long
Definition: utils.cpp:120
size_t merge_result_table_with(size_t n, size_t k, long *I0, float *D0, const long *I1, const float *D1, bool keep_min, long translation)
Definition: utils.cpp:1176
int rand_int()
random positive integer
Definition: utils.cpp:115
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
Definition: utils.cpp:1241
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
Definition: utils.cpp:1009
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
Definition: utils.cpp:947
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
Definition: utils.cpp:613
double getmillisecs()
ms elapsed since some arbitrary epoch
Definition: utils.cpp:70
void real_to_binary(size_t d, const float *x_in, uint8_t *x_out)
Definition: utils.cpp:1558
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
Definition: utils.cpp:1279
float fvec_norm_L2sqr(const float *x, size_t d)
Definition: utils_simd.cpp:516
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
Definition: utils.cpp:932
void matrix_qr(int m, int n, float *a)
Definition: utils.cpp:987
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
Definition: utils.cpp:1337
int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils_simd.cpp:676
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)
Definition: utils.cpp:633
void binary_to_real(size_t d, const uint8_t *x_in, float *x_out)
Definition: utils.cpp:1552