Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/utils.h
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 /** Copyright 2004-present Facebook. All Rights Reserved
11  * -*- c++ -*-
12  *
13  * A few utility functions for similarity search:
14  * - random generators
15  * - optimized exhaustive distance and knn search functions
16  * - some functions reimplemented from torch for speed
17  */
18 
19 #ifndef FAISS_utils_h
20 #define FAISS_utils_h
21 
22 #include <stdint.h>
23 // for the random data struct
24 #include <cstdlib>
25 
26 #include "Heap.h"
27 
28 
29 namespace faiss {
30 
31 
32 /**************************************************
33  * Get some stats about the system
34 **************************************************/
35 
36 
37 /// milliseconds elapsed since some arbitrary epoch (use differences between two calls)
38 double getmillisecs ();
39 
40 /// get current RSS (resident set size) usage in kB
41 size_t get_mem_usage_kb ();
42 
43 
44 /**************************************************
45  * Random data generation functions
46  **************************************************/
47 
48 /// random generator that can be used in multithreaded contexts
/// (each instance carries its own state in the members below)
49 struct RandomGenerator {
50 
51 #ifdef __linux__
52  char rand_state [8];
53  struct random_data rand_data;
54 #elif __APPLE__
55  unsigned rand_state;
56 #endif
57 
58  /// random 31-bit positive integer
59  int rand_int ();
60 
61  /// random long < 2 ^ 62
62  long rand_long ();
63 
64  /// generate random number between 0 and max-1
65  int rand_int (int max);
66 
67  /// between 0 and 1
68  float rand_float ();
69 
70  /// NOTE(review): undocumented; presumably the double-precision
  /// counterpart of rand_float -- confirm in utils.cpp
71  double rand_double ();
72 
73  /// initialize the generator from a seed (defaults to 1234)
74  explicit RandomGenerator (long seed = 1234);
75 
76  /// default copy constructor messes up pointer in rand_data,
  /// hence this user-provided copy constructor
77  RandomGenerator (const RandomGenerator & other);
78 
79 };
80 
81 /* Generate an array of uniform random floats / multi-threaded implementation.
 * NOTE(review): float_randn, long_rand and byte_rand share this comment;
 * presumably they fill x with normally-distributed floats, random longs
 * and random bytes respectively -- confirm in utils.cpp. */
82 void float_rand (float * x, size_t n, long seed);
83 void float_randn (float * x, size_t n, long seed);
84 void long_rand (long * x, size_t n, long seed);
85 void byte_rand (uint8_t * x, size_t n, long seed);
86 
87 /* random permutation, written to perm (NOTE(review): presumably a
 * permutation of the integers 0..n-1 -- confirm in utils.cpp) */
88 void rand_perm (int * perm, size_t n, long seed);
89 
90 
91 
92  /*********************************************************
93  * Optimized distance/norm/inner prod computations
94  *********************************************************/
95 
96 
97 /// Squared L2 distance between two vectors x and y (d: presumably the number of components of each vector -- confirm)
98 float fvec_L2sqr (
99  const float * x,
100  const float * y,
101  size_t d);
102 
103 /* Inner product of the vectors x and y. Per the original note, the inner product and the L2 distance are SSE implementations */
104 float fvec_inner_product (
105  const float * x,
106  const float * y,
107  size_t d);
108 
109 
110 /// imbalance factor (IF) of an assignment; a balanced assignment has an IF of 1
111 double imbalance_factor (int n, int k, const long *assign);
112 
113 /// same, takes a histogram as input
114 double imbalance_factor (int k, const int *hist);
115 
116 /** Compute pairwise distances between sets of vectors
117  *
118  * @param d dimension of the vectors
119  * @param nq nb of query vectors
120  * @param nb nb of database vectors
121  * @param xq query vectors (size nq * d)
122  * @param xb database vectors (size nb * d)
123  * @param dis output distances (size nq * nb)
124  * @param ldq,ldb,ldd strides for the matrices; all default to -1
 * (NOTE(review): -1 presumably selects the contiguous
 * layout -- confirm in utils.cpp)
125  */
126 void pairwise_L2sqr (long d,
127  long nq, const float *xq,
128  long nb, const float *xb,
129  float *dis,
130  long ldq = -1, long ldb = -1, long ldd = -1);
131 
132 
133 /* compute inner products of x and y into ip.
 * NOTE(review): the original comment read "between nx vectors x and one y",
 * which does not match the (d, ny) parameters; presumably this is one x
 * against ny contiguous y vectors, like fvec_L2sqr_ny -- confirm in utils.cpp */
134 void fvec_inner_products_ny (
135  float * ip, /* output inner product */
136  const float * x,
137  const float * y,
138  size_t d, size_t ny);
139 
140 /* compute ny square L2 distances between x and a set of contiguous y vectors */
141 void fvec_L2sqr_ny (
142  float * __restrict dis,
143  const float * x,
144  const float * y,
145  size_t d, size_t ny);
146 
147 
148 /** squared norm of a vector */
149 float fvec_norm_L2sqr (const float * x,
150  size_t d);
151 
152 /** compute the L2 norms for a set of vectors
153  *
154  * @param ip output norms, size nx
155  * @param x set of vectors, size nx * d
156  */
157 void fvec_norms_L2 (float * ip, const float * x, size_t d, size_t nx);
158 
159 /// same as fvec_norms_L2, but computes square norms
160 void fvec_norms_L2sqr (float * ip, const float * x, size_t d, size_t nx);
161 
162 /* L2-renormalize a set of vectors. Nothing is done if a vector is 0-normed */
163 void fvec_renorm_L2 (size_t d, size_t nx, float * x);
164 
165 
166 /* This function exists because the Torch counterpart is extremely slow
167  (not multi-threaded + unexpected overhead even in single thread).
168  It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y> */
169 void inner_product_to_L2sqr (float * __restrict dis,
170  const float * nr1,
171  const float * nr2,
172  size_t n1, size_t n2);
173 
174 /***************************************************************************
175  * Compute a subset of distances
176  ***************************************************************************/
177 
178  /* compute the inner product between x and a subset y of ny vectors,
179  whose indices are given by ids. */
180 void fvec_inner_products_by_idx (
181  float * __restrict ip,
182  const float * x,
183  const float * y,
184  const long * __restrict ids,
185  size_t d, size_t nx, size_t ny);
186 
187 /* same, for L2sqr distances: the subset in y is indexed by ids (ny vectors in total) */
188 void fvec_L2sqr_by_idx (
189  float * __restrict dis,
190  const float * x,
191  const float * y,
192  const long * __restrict ids, /* ids of y vecs */
193  size_t d, size_t nx, size_t ny);
194 
195 /***************************************************************************
196  * KNN functions
197  ***************************************************************************/
198 
199 
200 /** Return the k nearest neighbors of each of the nx vectors x among the ny
201  * vectors y, w.r.t. max inner product
202  *
203  * @param x query vectors, size nx * d
204  * @param y database vectors, size ny * d
205  * @param res result array, which also provides k. Sorted on output
206  */
207 void knn_inner_product (
208  const float * x,
209  const float * y,
210  size_t d, size_t nx, size_t ny,
211  float_minheap_array_t * res);
212 
213 /** Same as knn_inner_product, for the L2 distance */
214 void knn_L2sqr (
215  const float * x,
216  const float * y,
217  size_t d, size_t nx, size_t ny,
218  float_maxheap_array_t * res);
219 
220 /** same as knn_L2sqr, but base_shift[bno] is subtracted from all
221  * computed distances (NOTE(review): bno is presumably the index of the
 * database vector y -- confirm in utils.cpp).
222  *
223  * @param base_shift size ny
224  */
225 void knn_L2sqr_base_shift (
226  const float * x,
227  const float * y,
228  size_t d, size_t nx, size_t ny,
229  float_maxheap_array_t * res,
230  const float *base_shift);
231 
232 /* Find the nearest neighbors for nx queries in a set of ny vectors
233  * indexed by ids. May be useful for re-ranking a pre-selected vector list
234  */
235 void knn_inner_products_by_idx (
236  const float * x,
237  const float * y,
238  const long * ids,
239  size_t d, size_t nx, size_t ny,
240  float_minheap_array_t * res);
241 /* NOTE(review): undocumented; by name and signature presumably the L2sqr counterpart of knn_inner_products_by_idx -- confirm */
242 void knn_L2sqr_by_idx (const float * x,
243  const float * y,
244  const long * __restrict ids,
245  size_t d, size_t nx, size_t ny,
246  float_maxheap_array_t * res);
247 
248 /***************************************************************************
249  * Range search
250  ***************************************************************************/
251 
252 
253 
254 /// Forward declaration, see AuxIndexStructures.h
255 struct RangeSearchResult;
256 
257 /** Range search: for each of the nx vectors x, look among the ny
258  * vectors y within the given radius, w.r.t. the squared L2 distance.
 * NOTE(review): the previous summary ("k nearest neighbors ... max inner
 * product") was copied from knn_inner_product; the exact inclusion
 * criterion should be confirmed in utils.cpp.
259  *
260  * @param x query vectors, size nx * d
261  * @param y database vectors, size ny * d
262  * @param radius search radius around the x vectors
263  * @param result result structure
264  */
265 void range_search_L2sqr (
266  const float * x,
267  const float * y,
268  size_t d, size_t nx, size_t ny,
269  float radius,
270  RangeSearchResult *result);
271 
272 /// same as range_search_L2sqr for the inner product similarity
273 void range_search_inner_product (
274  const float * x,
275  const float * y,
276  size_t d, size_t nx, size_t ny,
277  float radius,
278  RangeSearchResult *result);
279 
280 
281 
282 
283 
284 /***************************************************************************
285  * Misc matrix and vector manipulation functions
286  ***************************************************************************/
287 
288 
289 /** compute c := a + bf * b for a, b and c tables
290  *
291  * @param n size of the tables
292  * @param a size n
 * @param bf scalar factor applied to b
293  * @param b size n
294  * @param c result table, size n
295  */
296 void fvec_madd (size_t n, const float *a,
297  float bf, const float *b, float *c);
298 
299 
300 /** same as fvec_madd, also return index of the min of the result table
301  * @return index of the min of table c
302  */
303 int fvec_madd_and_argmin (size_t n, const float *a,
304  float bf, const float *b, float *c);
305 
306 
307 /* perform a reflection (not an efficient implementation, just for test). NOTE(review): parameters are undocumented; x is the only writable array -- confirm the layout of u and x in utils.cpp */
308 void reflection (const float * u, float * x, size_t n, size_t d, size_t nu);
309 
310 
311 /** For k-means: update stage. Returns nb of split clusters. */
312 int km_update_centroids (
313  const float * x,
314  float * centroids,
315  long * assign,
316  size_t d, size_t k, size_t n);
317 
318 /** compute the Q of the QR decomposition for m > n
319  * @param a size n * m: input matrix, overwritten with the output Q
320  */
321 void matrix_qr (int m, int n, float *a);
322 
323 /** distances are supposed to be sorted. Sorts indices with the same distance */
324 void ranklist_handle_ties (int k, long *idx, const float *dis);
325 
326 /** count the number of common elements between v1 and v2
327  * algorithm = sorting + bisection to avoid double-counting duplicates
328  */
329 size_t ranklist_intersection_size (size_t k1, const long *v1,
330  size_t k2, const long *v2);
331 
332 
/* NOTE(review): undocumented; by name presumably writes to perm the
 * permutation that sorts the n values of vals -- confirm the sort order
 * in utils.cpp */
333 void fvec_argsort (size_t n, const float *vals,
334  size_t *perm);
335 
/* NOTE(review): undocumented; presumably a parallel variant of fvec_argsort
 * with the same arguments -- confirm in utils.cpp */
336 void fvec_argsort_parallel (size_t n, const float *vals,
337  size_t *perm);
338 
339 
340 /// compute histogram on v (NOTE(review): meaning of the int return value and of vmax is not documented here -- confirm in utils.cpp)
341 int ivec_hist (size_t n, const int * v, int vmax, int *hist);
342 
343 /** Compute histogram of bits on a code array
344  *
345  * @param codes size(n, nbits / 8)
346  * @param hist size(nbits): nb of 1s in the array of codes
347  */
348 void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist);
349 
350 
351 /// compute a checksum on a table a of n ints.
352 size_t ivec_checksum (size_t n, const int *a);
353 
354 
355 /** randomly subsample a set of vectors if there are too many of them
356  *
357  * @param d dimension of the vectors
358  * @param n on input: nb of input vectors, output: nb of output vectors
359  * @param nmax max nb of vectors to keep
360  * @param x input array, size *n-by-d
 * @param verbose defaults to false (NOTE(review): presumably enables
 * diagnostic output -- confirm in utils.cpp)
361  * @param seed random seed to use for sampling
362  * @return x or an array allocated with new [] with *n vectors
 * (NOTE(review): when the result differs from x the caller
 * presumably has to delete [] it -- confirm)
363  */
364 const float *fvecs_maybe_subsample (
365  size_t d, size_t *n, size_t nmax, const float *x,
366  bool verbose = false, long seed = 1234);
367 
368 } // namespace faiss
369 
370 
371 #endif /* FAISS_utils_h */
random generator that can be used in multithreaded contexts
Definition: utils.h:49
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
Definition: utils.cpp:872
RandomGenerator(long seed=1234)
initialize
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n)
Definition: utils.cpp:1286
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils.cpp:432
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
Definition: utils.cpp:1467
const float * fvecs_maybe_subsample(size_t d, size_t *n, size_t nmax, const float *x, bool verbose, long seed)
Definition: utils.cpp:1793
void ranklist_handle_ties(int k, long *idx, const float *dis)
Definition: utils.cpp:1378
float rand_float()
between 0 and 1
Definition: utils.cpp:208
void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1707
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
Definition: utils.cpp:1456
long rand_long()
random long < 2 ^ 62
int rand_int()
random 31-bit positive integer
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
Definition: utils.cpp:1394
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
Definition: utils.cpp:1228
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
Definition: utils.cpp:1166
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
Definition: utils.cpp:832
double getmillisecs()
ms elapsed since some arbitrary epoch
Definition: utils.cpp:71
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
Definition: utils.cpp:1432
float fvec_norm_L2sqr(const float *x, size_t d)
Definition: utils.cpp:513
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
Definition: utils.cpp:1151
void matrix_qr(int m, int n, float *a)
Definition: utils.cpp:1206
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
Definition: utils.cpp:1490
int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1781
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)
Definition: utils.cpp:852