Faiss
/tmp/faiss/utils.cpp
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// -*- c++ -*-

#include "utils.h"

#include <cstdio>
#include <cassert>
#include <cstring>
#include <cmath>

#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <omp.h>

#include <immintrin.h>

#include <algorithm>
#include <vector>

#include "AuxIndexStructures.h"
#include "FaissAssert.h"



#ifndef FINTEGER
#define FINTEGER long
#endif


extern "C" {

/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */

int sgemm_ (const char *transa, const char *transb, FINTEGER *m,
            FINTEGER *n, FINTEGER *k, const float *alpha, const float *a,
            FINTEGER *lda, const float *b, FINTEGER *ldb,
            float *beta, float *c, FINTEGER *ldc);

/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */

int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
             float *tau, float *work, FINTEGER *lwork, FINTEGER *info);

int sorgqr_ (FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
             FINTEGER *lda, float *tau, float *work,
             FINTEGER *lwork, FINTEGER *info);

int sgemv_ (const char *trans, FINTEGER *m, FINTEGER *n, float *alpha,
            const float *a, FINTEGER *lda, const float *x, FINTEGER *incx,
            float *beta, float *y, FINTEGER *incy);

}
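
// Illustrative sketch (not part of the original file): Fortran BLAS is
// column-major while this file stores matrices row-major. A row-major
// product C = A * B (A is m x k, B is k x n) can be obtained by computing
// the column-major product B^T * A^T, i.e. by swapping the operand order.
// The sgemm_ calls further down rely on this trick ("N"/"Transpose" strings
// are equivalent to BLAS: only the first character is read).
static inline void example_rowmajor_sgemm (
        FINTEGER m, FINTEGER n, FINTEGER k,
        const float *A, const float *B, float *C)
{
    float one = 1.0, zero = 0.0;
    // column-major view: C^T (n x m) = B^T (n x k) * A^T (k x m)
    sgemm_ ("N", "N", &n, &m, &k, &one,
            B, &n,          // B with leading dimension n reads as B^T
            A, &k,          // A with leading dimension k reads as A^T
            &zero, C, &n);  // memory of C^T is exactly row-major C
}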


/**************************************************
 * Get some stats about the system
 **************************************************/

namespace faiss {

double getmillisecs () {
    struct timeval tv;
    gettimeofday (&tv, nullptr);
    return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
}


#ifdef __linux__

size_t get_mem_usage_kb ()
{
    int pid = getpid ();
    char fname[256];
    snprintf (fname, 256, "/proc/%d/status", pid);
    FILE * f = fopen (fname, "r");
    FAISS_THROW_IF_NOT_MSG (f, "cannot open proc status file");
    size_t sz = 0;
    for (;;) {
        char buf[256];
        if (!fgets (buf, 256, f)) break;
        if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break;
    }
    fclose (f);
    return sz;
}

#elif __APPLE__

size_t get_mem_usage_kb ()
{
    fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
    return 0;
}

#endif



/**************************************************
 * Random data generation functions
 **************************************************/

RandomGenerator::RandomGenerator (long seed)
    : mt((unsigned int)seed) {}

int RandomGenerator::rand_int ()
{
    return mt() & 0x7fffffff;
}

long RandomGenerator::rand_long ()
{
    return long(rand_int()) | long(rand_int()) << 31;
}

int RandomGenerator::rand_int (int max)
{
    return mt() % max;
}

float RandomGenerator::rand_float ()
{
    return mt() / float(mt.max());
}

double RandomGenerator::rand_double ()
{
    return mt() / double(mt.max());
}

/***********************************************************************
 * Random functions in this C file only exist because the Torch
 * counterparts are slow and not multi-threaded. Typical use is for
 * more than 1-100 billion values. */


/* Generate a set of random floating-point values such that x[i] is in
   [0, 1]. The sequence must be deterministic even when the computation is
   multi-threaded. For this reason, we rely on re-entrant functions. */
void float_rand (float * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {

        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        for (size_t i = istart; i < iend; i++)
            x[i] = rng.rand_float ();
    }
}
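
// Illustrative sketch (not part of the original file): because each of the
// nblock blocks re-seeds its own RandomGenerator from (a0 + j * b0), the
// output depends only on the seed, not on how OpenMP schedules the blocks.
static inline bool example_float_rand_is_deterministic ()
{
    std::vector<float> r1 (1 << 20), r2 (1 << 20);
    float_rand (r1.data(), r1.size(), 1234);
    float_rand (r2.data(), r2.size(), 1234);  // same seed, any thread count
    return memcmp (r1.data(), r2.data(), sizeof(float) * r1.size()) == 0;
}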


void float_randn (float * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {
        RandomGenerator rng (a0 + j * b0);

        double a = 0, b = 0, s = 0;
        int state = 0; /* generate two numbers per "do-while" loop */

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        for (size_t i = istart; i < iend; i++) {
            /* Marsaglia's method (see Knuth) */
            if (state == 0) {
                do {
                    a = 2.0 * rng.rand_double () - 1;
                    b = 2.0 * rng.rand_double () - 1;
                    s = a * a + b * b;
                } while (s >= 1.0);
                x[i] = a * sqrt(-2.0 * log(s) / s);
            }
            else
                x[i] = b * sqrt(-2.0 * log(s) / s);
            state = 1 - state;
        }
    }
}
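
// Illustrative sketch (not part of the original file): Marsaglia's polar
// method turns pairs of uniform samples into independent N(0, 1) samples,
// so the empirical mean and variance should come out close to 0 and 1.
static inline void example_check_float_randn_moments ()
{
    const size_t n = 1 << 20;
    std::vector<float> x (n);
    float_randn (x.data(), n, 42);
    double sum = 0, sum2 = 0;
    for (size_t i = 0; i < n; i++) {
        sum += x[i];
        sum2 += x[i] * x[i];
    }
    double mean = sum / n;
    double var = sum2 / n - mean * mean;
    printf ("mean=%g (expect ~0), var=%g (expect ~1)\n", mean, var);
}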


/* Integer versions */
void long_rand (long * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {

        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;
        for (size_t i = istart; i < iend; i++)
            x[i] = rng.rand_long ();
    }
}



void rand_perm (int *perm, size_t n, long seed)
{
    for (size_t i = 0; i < n; i++) perm[i] = i;

    RandomGenerator rng (seed);

    for (size_t i = 0; i + 1 < n; i++) {
        int i2 = i + rng.rand_int (n - i);
        std::swap(perm[i], perm[i2]);
    }
}
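
// Illustrative sketch (not part of the original file): rand_perm is a
// Fisher-Yates shuffle, so for a fixed seed it produces one uniformly
// random permutation of 0..n-1. A typical use is sampling k of n rows
// without replacement (k <= n assumed).
static inline void example_sample_k_of_n (int *out, size_t k, size_t n)
{
    std::vector<int> perm (n);
    rand_perm (perm.data(), n, 1234);            // permutation of 0..n-1
    memcpy (out, perm.data(), k * sizeof(int));  // first k entries = sample
}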




void byte_rand (uint8_t * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {

        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        size_t i;
        for (i = istart; i < iend; i++)
            x[i] = rng.rand_long ();
    }
}



// d is assumed to be even: the inner-product loop is unrolled by pairs
void reflection (const float * __restrict u,
                 float * __restrict x,
                 size_t n, size_t d, size_t nu)
{
    size_t i, j, l;
    for (i = 0; i < n; i++) {
        const float * up = u;
        for (l = 0; l < nu; l++) {
            float ip1 = 0, ip2 = 0;

            for (j = 0; j < d; j += 2) {
                ip1 += up[j] * x[j];
                ip2 += up[j + 1] * x[j + 1];
            }
            float ip = 2 * (ip1 + ip2);

            for (j = 0; j < d; j++)
                x[j] -= ip * up[j];
            up += d;
        }
        x += d;
    }
}


/* Reference implementation (slower) */
void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu)
{
    size_t i, j, l;
    for (i = 0; i < n; i++) {
        const float * up = u;
        for (l = 0; l < nu; l++) {
            double ip = 0;

            for (j = 0; j < d; j++)
                ip += up[j] * x[j];
            ip *= 2;

            for (j = 0; j < d; j++)
                x[j] -= ip * up[j];

            up += d;
        }
        x += d;
    }
}




/***************************************************************************
 * Matrix/vector ops
 ***************************************************************************/



/* Compute the inner product between a vector x and
   a set of ny vectors y.
   These functions are not intended to replace BLAS matrix-matrix products:
   for that use case they would be significantly less efficient. */
void fvec_inner_products_ny (float * ip,
                             const float * x,
                             const float * y,
                             size_t d, size_t ny)
{
    // Not sure which one is fastest
#if 0
    {
        FINTEGER di = d;
        FINTEGER nyi = ny;
        float one = 1.0, zero = 0.0;
        FINTEGER onei = 1;
        sgemv_ ("T", &di, &nyi, &one, y, &di, x, &onei, &zero, ip, &onei);
    }
#endif
    for (size_t i = 0; i < ny; i++) {
        ip[i] = fvec_inner_product (x, y, d);
        y += d;
    }
}




/* Compute the L2 norm of a set of nx vectors */
void fvec_norms_L2 (float * __restrict nr,
                    const float * __restrict x,
                    size_t d, size_t nx)
{

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
    }
}

void fvec_norms_L2sqr (float * __restrict nr,
                       const float * __restrict x,
                       size_t d, size_t nx)
{
#pragma omp parallel for
    for (size_t i = 0; i < nx; i++)
        nr[i] = fvec_norm_L2sqr (x + i * d, d);
}



void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
{
#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        float * __restrict xi = x + i * d;

        float nr = fvec_norm_L2sqr (xi, d);

        if (nr > 0) {
            size_t j;
            const float inv_nr = 1.0 / sqrtf (nr);
            for (j = 0; j < d; j++)
                xi[j] *= inv_nr;
        }
    }
}
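
// Illustrative sketch (not part of the original file): after fvec_renorm_L2
// every non-zero vector has unit L2 norm, which turns inner-product search
// into cosine-similarity search.
static inline void example_renorm_then_check (float *x, size_t d, size_t nx)
{
    fvec_renorm_L2 (d, nx, x);
    std::vector<float> norms (nx);
    fvec_norms_L2 (norms.data(), x, d, nx);
    for (size_t i = 0; i < nx; i++)
        assert (norms[i] == 0 || fabs (norms[i] - 1.0) < 1e-4);
}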




/***************************************************************************
 * KNN functions
 ***************************************************************************/



/* Find the nearest neighbors for nx queries in a set of ny vectors */
static void knn_inner_product_sse (const float * x,
                                   const float * y,
                                   size_t d, size_t nx, size_t ny,
                                   float_minheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_i = x + i * d;
        const float * y_j = y;

        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);

        minheap_heapify (k, simi, idxi);

        for (size_t j = 0; j < ny; j++) {
            float ip = fvec_inner_product (x_i, y_j, d);

            if (ip > simi[0]) {
                minheap_pop (k, simi, idxi);
                minheap_push (k, simi, idxi, ip, j);
            }
            y_j += d;
        }
        minheap_reorder (k, simi, idxi);
    }

}

static void knn_L2sqr_sse (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_maxheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_i = x + i * d;
        const float * y_j = y;
        size_t j;
        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);

        maxheap_heapify (k, simi, idxi);
        for (j = 0; j < ny; j++) {
            float disij = fvec_L2sqr (x_i, y_j, d);

            if (disij < simi[0]) {
                maxheap_pop (k, simi, idxi);
                maxheap_push (k, simi, idxi, disij, j);
            }
            y_j += d;
        }
        maxheap_reorder (k, simi, idxi);
    }

}


/** Find the nearest neighbors for nx queries in a set of ny vectors */
static void knn_inner_product_blas (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_minheap_array_t * res)
{
    res->heapify ();

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    // const size_t bs_x = 16, bs_y = 16;
    float *ip_block = new float[bs_x * bs_y];

    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
        size_t i1 = i0 + bs_x;
        if (i1 > nx) i1 = nx;

        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
            size_t j1 = j0 + bs_y;
            if (j1 > ny) j1 = ny;
            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }

            /* collect maxima */
            res->addn (j1 - j0, ip_block, j0, i0, i1 - i0);
        }
    }
    delete [] ip_block;
    res->reorder ();
}

// distance correction is an operator that can be applied to transform
// the distances
template<class DistanceCorrection>
static void knn_L2sqr_blas (const float * x,
                            const float * y,
                            size_t d, size_t nx, size_t ny,
                            float_maxheap_array_t * res,
                            const DistanceCorrection &corr)
{
    res->heapify ();

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    size_t k = res->k;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    // const size_t bs_x = 16, bs_y = 16;
    float *ip_block = new float[bs_x * bs_y];

    float *x_norms = new float[nx];
    fvec_norms_L2sqr (x_norms, x, d, nx);

    float *y_norms = new float[ny];
    fvec_norms_L2sqr (y_norms, y, d, ny);

    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
        size_t i1 = i0 + bs_x;
        if (i1 > nx) i1 = nx;

        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
            size_t j1 = j0 + bs_y;
            if (j1 > ny) j1 = ny;
            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }

            /* collect minima */
#pragma omp parallel for
            for (size_t i = i0; i < i1; i++) {
                float * __restrict simi = res->get_val(i);
                long * __restrict idxi = res->get_ids (i);
                const float *ip_line = ip_block + (i - i0) * (j1 - j0);

                for (size_t j = j0; j < j1; j++) {
                    float ip = *ip_line++;
                    float dis = x_norms[i] + y_norms[j] - 2 * ip;

                    // negative values can occur for identical vectors
                    // due to roundoff errors
                    if (dis < 0) dis = 0;

                    dis = corr (dis, i, j);

                    if (dis < simi[0]) {
                        maxheap_pop (k, simi, idxi);
                        maxheap_push (k, simi, idxi, dis, j);
                    }
                }
            }
        }
    }
    res->reorder ();

    delete [] ip_block;
    delete [] x_norms;
    delete [] y_norms;
}




/*******************************************************
 * KNN driver functions
 *******************************************************/

int distance_compute_blas_threshold = 20;

void knn_inner_product (const float * x,
                        const float * y,
                        size_t d, size_t nx, size_t ny,
                        float_minheap_array_t * res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        knn_inner_product_sse (x, y, d, nx, ny, res);
    } else {
        knn_inner_product_blas (x, y, d, nx, ny, res);
    }
}

struct NopDistanceCorrection {
    float operator()(float dis, size_t /*qno*/, size_t /*bno*/) const {
        return dis;
    }
};

void knn_L2sqr (const float * x,
                const float * y,
                size_t d, size_t nx, size_t ny,
                float_maxheap_array_t * res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        knn_L2sqr_sse (x, y, d, nx, ny, res);
    } else {
        NopDistanceCorrection nop;
        knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
    }
}
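
// Illustrative sketch (not part of the original file): calling the k-NN
// driver directly. This assumes the float_maxheap_array_t layout from
// Heap.h (nh heaps of size k, with ids/val arrays of size nh * k). After
// the call, row i of val holds the k smallest squared distances for query
// i in ascending order, and ids the matching indices into y.
static inline void example_knn_L2sqr (
        const float *x, const float *y,
        size_t d, size_t nx, size_t ny, size_t k)
{
    std::vector<long> labels (nx * k);
    std::vector<float> distances (nx * k);
    float_maxheap_array_t res;
    res.nh = nx; res.k = k;
    res.ids = labels.data(); res.val = distances.data();
    knn_L2sqr (x, y, d, nx, ny, &res);
}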

struct BaseShiftDistanceCorrection {
    const float *base_shift;
    float operator()(float dis, size_t /*qno*/, size_t bno) const {
        return dis - base_shift[bno];
    }
};

void knn_L2sqr_base_shift (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_maxheap_array_t * res,
        const float *base_shift)
{
    BaseShiftDistanceCorrection corr = {base_shift};
    knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
}


/***************************************************************************
 * compute a subset of distances
 ***************************************************************************/

/* compute the inner product between x and a subset y of ny vectors,
   whose indices are given by ids. */
void fvec_inner_products_by_idx (float * __restrict ip,
                                 const float * x,
                                 const float * y,
                                 const long * __restrict ids, /* for y vecs */
                                 size_t d, size_t nx, size_t ny)
{
#pragma omp parallel for
    for (size_t j = 0; j < nx; j++) {
        const long * __restrict idsj = ids + j * ny;
        const float * xj = x + j * d;
        float * __restrict ipj = ip + j * ny;
        for (size_t i = 0; i < ny; i++) {
            if (idsj[i] < 0)
                continue;
            ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
        }
    }
}

/* compute the squared L2 distances between x and a subset y of ny vectors,
   whose indices are given by ids. */
void fvec_L2sqr_by_idx (float * __restrict dis,
                        const float * x,
                        const float * y,
                        const long * __restrict ids, /* ids of y vecs */
                        size_t d, size_t nx, size_t ny)
{
#pragma omp parallel for
    for (size_t j = 0; j < nx; j++) {
        const long * __restrict idsj = ids + j * ny;
        const float * xj = x + j * d;
        float * __restrict disj = dis + j * ny;
        for (size_t i = 0; i < ny; i++) {
            if (idsj[i] < 0)
                continue;
            disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d);
        }
    }
}




/* Find the nearest neighbors for nx queries in a set of ny vectors
   indexed by ids. May be useful for re-ranking a pre-selected vector list */
void knn_inner_products_by_idx (const float * x,
                                const float * y,
                                const long * ids,
                                size_t d, size_t nx, size_t ny,
                                float_minheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const long * idsi = ids + i * ny;
        size_t j;
        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);
        minheap_heapify (k, simi, idxi);

        for (j = 0; j < ny; j++) {
            if (idsi[j] < 0) break;
            float ip = fvec_inner_product (x_, y + d * idsi[j], d);

            if (ip > simi[0]) {
                minheap_pop (k, simi, idxi);
                minheap_push (k, simi, idxi, ip, idsi[j]);
            }
        }
        minheap_reorder (k, simi, idxi);
    }

}

void knn_L2sqr_by_idx (const float * x,
                       const float * y,
                       const long * __restrict ids,
                       size_t d, size_t nx, size_t ny,
                       float_maxheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const long * __restrict idsi = ids + i * ny;
        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);
        maxheap_heapify (res->k, simi, idxi);
        for (size_t j = 0; j < ny; j++) {
            float disij = fvec_L2sqr (x_, y + d * idsi[j], d);

            if (disij < simi[0]) {
                maxheap_pop (k, simi, idxi);
                maxheap_push (k, simi, idxi, disij, idsi[j]);
            }
        }
        maxheap_reorder (res->k, simi, idxi);
    }

}




/***************************************************************************
 * Range search
 ***************************************************************************/

/** Find the nearest neighbors for nx queries in a set of ny vectors
 * compute_l2 = compute pairwise squared L2 distances rather than inner
 * products
 */
template <bool compute_l2>
static void range_search_blas (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *result)
{

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    // const size_t bs_x = 16, bs_y = 16;
    float *ip_block = new float[bs_x * bs_y];

    float *x_norms = nullptr, *y_norms = nullptr;

    if (compute_l2) {
        x_norms = new float[nx];
        fvec_norms_L2sqr (x_norms, x, d, nx);
        y_norms = new float[ny];
        fvec_norms_L2sqr (y_norms, y, d, ny);
    }

    std::vector<RangeSearchPartialResult *> partial_results;

    for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
        size_t j1 = j0 + bs_y;
        if (j1 > ny) j1 = ny;
        RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
        partial_results.push_back (pres);

        for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
            size_t i1 = i0 + bs_x;
            if (i1 > nx) i1 = nx;

            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }


            for (size_t i = i0; i < i1; i++) {
                const float *ip_line = ip_block + (i - i0) * (j1 - j0);

                RangeSearchPartialResult::QueryResult & qres =
                    pres->new_result (i);

                for (size_t j = j0; j < j1; j++) {
                    float ip = *ip_line++;
                    if (compute_l2) {
                        float dis = x_norms[i] + y_norms[j] - 2 * ip;
                        if (dis < radius) {
                            qres.add (dis, j);
                        }
                    } else {
                        if (ip > radius) {
                            qres.add (ip, j);
                        }
                    }
                }
            }
        }

    }
    delete [] ip_block;
    delete [] x_norms;
    delete [] y_norms;

    { // merge the partial results
        int npres = partial_results.size();
        // count
        for (size_t i = 0; i < nx; i++) {
            for (int j = 0; j < npres; j++)
                result->lims[i] += partial_results[j]->queries[i].nres;
        }
        result->do_allocation ();
        for (int j = 0; j < npres; j++) {
            partial_results[j]->set_result (true);
            delete partial_results[j];
        }

        // reset the limits
        for (size_t i = nx; i > 0; i--) {
            result->lims[i] = result->lims[i - 1];
        }
        result->lims[0] = 0;
    }
}


template <bool compute_l2>
static void range_search_sse (const float * x,
                              const float * y,
                              size_t d, size_t nx, size_t ny,
                              float radius,
                              RangeSearchResult *res)
{
    FAISS_THROW_IF_NOT (d % 4 == 0);

#pragma omp parallel
    {
        RangeSearchPartialResult pres (res);

#pragma omp for
        for (size_t i = 0; i < nx; i++) {
            const float * x_ = x + i * d;
            const float * y_ = y;
            size_t j;

            RangeSearchPartialResult::QueryResult & qres =
                pres.new_result (i);

            for (j = 0; j < ny; j++) {
                if (compute_l2) {
                    float disij = fvec_L2sqr (x_, y_, d);
                    if (disij < radius) {
                        qres.add (disij, j);
                    }
                } else {
                    float ip = fvec_inner_product (x_, y_, d);
                    if (ip > radius) {
                        qres.add (ip, j);
                    }
                }
                y_ += d;
            }

        }
        pres.finalize ();
    }
}



void range_search_L2sqr (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *res)
{

    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        range_search_sse<true> (x, y, d, nx, ny, radius, res);
    } else {
        range_search_blas<true> (x, y, d, nx, ny, radius, res);
    }
}

void range_search_inner_product (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *res)
{

    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        range_search_sse<false> (x, y, d, nx, ny, radius, res);
    } else {
        range_search_blas<false> (x, y, d, nx, ny, radius, res);
    }
}
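
// Illustrative sketch (not part of the original file); assumes the
// RangeSearchResult interface from AuxIndexStructures.h (constructor taking
// the number of queries, and lims/labels/distances arrays filled by the
// call). Matches of query i occupy entries lims[i]..lims[i+1]-1, in
// arbitrary order.
static inline void example_range_search (
        const float *x, const float *y,
        size_t d, size_t nx, size_t ny, float radius)
{
    RangeSearchResult res (nx);
    range_search_L2sqr (x, y, d, nx, ny, radius, &res);
    // print all matches of query 0
    for (size_t j = res.lims[0]; j < res.lims[1]; j++)
        printf ("match: y[%ld] at distance %g\n",
                res.labels[j], res.distances[j]);
}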


/***************************************************************************
 * Some matrix manipulation functions
 ***************************************************************************/


/* This function exists because the Torch counterpart is extremely slow
   (not multi-threaded + unexpected overhead even in single thread).
   It is here to implement the usual property |x-y|^2 = |x|^2 + |y|^2 - 2<x|y> */
void inner_product_to_L2sqr (float * __restrict dis,
                             const float * nr1,
                             const float * nr2,
                             size_t n1, size_t n2)
{

#pragma omp parallel for
    for (size_t j = 0; j < n1; j++) {
        float * disj = dis + j * n2;
        for (size_t i = 0; i < n2; i++)
            disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
    }
}
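
// Illustrative sketch (not part of the original file): the identity
// |x-y|^2 = |x|^2 + |y|^2 - 2<x|y> can be checked directly on one pair of
// vectors with the kernels from utils_simd.cpp.
static inline void example_check_l2_identity (const float *x, const float *y,
                                              size_t d)
{
    float ip  = fvec_inner_product (x, y, d);
    float nx2 = fvec_norm_L2sqr (x, d);
    float ny2 = fvec_norm_L2sqr (y, d);
    float l2  = fvec_L2sqr (x, y, d);
    assert (fabs ((nx2 + ny2 - 2 * ip) - l2) < 1e-4 * (l2 + 1));
}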


void matrix_qr (int m, int n, float *a)
{
    FAISS_THROW_IF_NOT (m >= n);
    FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
    std::vector<float> tau (ki);
    FINTEGER lwork = -1, info;
    float work_size;

    // workspace query: lwork = -1 makes LAPACK return the optimal
    // work size in work_size
    sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
             &work_size, &lwork, &info);
    lwork = size_t(work_size);
    std::vector<float> work (lwork);

    sgeqrf_ (&mi, &ni, a, &mi,
             tau.data(), work.data(), &lwork, &info);

    sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
             work.data(), &lwork, &info);

}
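
// Illustrative sketch (not part of the original file): a common use of
// matrix_qr is to build a random orthonormal basis: fill an n x n matrix
// with Gaussian values, then orthonormalize it in place.
static inline void example_random_rotation (float *q, int n, long seed)
{
    float_randn (q, (size_t)n * n, seed);
    matrix_qr (n, n, q);   // q now holds an orthonormal basis
}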


void pairwise_L2sqr (long d,
                     long nq, const float *xq,
                     long nb, const float *xb,
                     float *dis,
                     long ldq, long ldb, long ldd)
{
    if (nq == 0 || nb == 0) return;
    if (ldq == -1) ldq = d;
    if (ldb == -1) ldb = d;
    if (ldd == -1) ldd = nb;

    // store in beginning of distance matrix to avoid malloc
    float *b_norms = dis;

#pragma omp parallel for
    for (long i = 0; i < nb; i++)
        b_norms[i] = fvec_norm_L2sqr (xb + i * ldb, d);

#pragma omp parallel for
    for (long i = 1; i < nq; i++) {
        float q_norm = fvec_norm_L2sqr (xq + i * ldq, d);
        for (long j = 0; j < nb; j++)
            dis[i * ldd + j] = q_norm + b_norms[j];
    }

    {   // row 0 still holds the b_norms, so add the query norm in place
        float q_norm = fvec_norm_L2sqr (xq, d);
        for (long j = 0; j < nb; j++)
            dis[j] += q_norm;
    }

    {
        FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
        float one = 1.0, minus_2 = -2.0;

        sgemm_ ("Transposed", "Not transposed",
                &nbi, &nqi, &di,
                &minus_2,
                xb, &ldbi,
                xq, &ldqi,
                &one, dis, &lddi);
    }


}
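
// Illustrative sketch (not part of the original file): with the default
// leading dimensions (-1), xq and xb are plain row-major nq x d and nb x d
// matrices and dis becomes a row-major nq x nb distance matrix.
static inline void example_pairwise_L2sqr (
        long d, long nq, const float *xq, long nb, const float *xb,
        float *dis /* size nq * nb */)
{
    pairwise_L2sqr (d, nq, xq, nb, xb, dis, -1, -1, -1);
    // dis[i * nb + j] == squared L2 distance between xq[i] and xb[j]
}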



/***************************************************************************
 * Kmeans subroutine
 ***************************************************************************/

// a bit above machine epsilon for float16

#define EPS (1 / 1024.)

/* For k-means, compute centroids given assignment of vectors to centroids */
int km_update_centroids (const float * x,
                         float * centroids,
                         long * assign,
                         size_t d, size_t k, size_t n,
                         size_t k_frozen)
{
    k -= k_frozen;
    centroids += k_frozen * d;

    std::vector<size_t> hassign(k);
    memset (centroids, 0, sizeof(*centroids) * d * k);

#pragma omp parallel
    {
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();
        // this thread is taking care of centroids c0:c1
        size_t c0 = (k * rank) / nt;
        size_t c1 = (k * (rank + 1)) / nt;
        const float *xi = x;
        size_t nacc = 0;

        for (size_t i = 0; i < n; i++) {
            long ci = assign[i];
            assert (ci >= 0 && ci < k + k_frozen);
            ci -= k_frozen;
            if (ci >= c0 && ci < c1) {
                float * c = centroids + ci * d;
                hassign[ci]++;
                for (size_t j = 0; j < d; j++)
                    c[j] += xi[j];
                nacc++;
            }
            xi += d;
        }

    }

#pragma omp parallel for
    for (size_t ci = 0; ci < k; ci++) {
        float * c = centroids + ci * d;
        float ni = (float) hassign[ci];
        if (ni != 0) {
            for (size_t j = 0; j < d; j++)
                c[j] /= ni;
        }
    }

    /* Take care of void clusters */
    size_t nsplit = 0;
    RandomGenerator rng (1234);
    for (size_t ci = 0; ci < k; ci++) {
        if (hassign[ci] == 0) { /* need to redefine a centroid */
            size_t cj;
            for (cj = 0; 1; cj = (cj + 1) % k) {
                /* probability to pick this cluster for split */
                float p = (hassign[cj] - 1.0) / (float) (n - k);
                float r = rng.rand_float ();
                if (r < p) {
                    break; /* found our cluster to be split */
                }
            }
            memcpy (centroids + ci * d, centroids + cj * d,
                    sizeof(*centroids) * d);

            /* small symmetric perturbation */
            for (size_t j = 0; j < d; j++) {
                if (j % 2 == 0) {
                    centroids[ci * d + j] *= 1 + EPS;
                    centroids[cj * d + j] *= 1 - EPS;
                } else {
                    centroids[ci * d + j] *= 1 - EPS;
                    centroids[cj * d + j] *= 1 + EPS;
                }
            }

            /* assume even split of the cluster */
            hassign[ci] = hassign[cj] / 2;
            hassign[cj] -= hassign[ci];
            nsplit++;
        }
    }

    return nsplit;
}

#undef EPS
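
// Illustrative sketch (not part of the original file): one Lloyd iteration
// written with the utilities in this file. The assignment step is a 1-NN
// search of the points among the current centroids; the update step then
// recomputes the centroids (km_update_centroids also re-seeds empty
// clusters). Assumes the float_maxheap_array_t layout from Heap.h.
static inline void example_kmeans_iteration (
        const float *x, float *centroids,
        size_t d, size_t k, size_t n)
{
    std::vector<long> assign (n);
    std::vector<float> dis (n);
    float_maxheap_array_t res;
    res.nh = n; res.k = 1;
    res.ids = assign.data(); res.val = dis.data();
    knn_L2sqr (x, centroids, d, n, k, &res);  // nearest centroid per point
    km_update_centroids (x, centroids, assign.data(), d, k, n, 0);
}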


/***************************************************************************
 * Result list routines
 ***************************************************************************/


void ranklist_handle_ties (int k, long *idx, const float *dis)
{
    float prev_dis = -1e38;
    int prev_i = -1;
    for (int i = 0; i < k; i++) {
        if (dis[i] != prev_dis) {
            if (i > prev_i + 1) {
                // sort between prev_i and i - 1
                std::sort (idx + prev_i, idx + i);
            }
            prev_i = i;
            prev_dis = dis[i];
        }
    }
}

size_t merge_result_table_with (size_t n, size_t k,
                                long *I0, float *D0,
                                const long *I1, const float *D1,
                                bool keep_min,
                                long translation)
{
    size_t n1 = 0;

#pragma omp parallel reduction(+:n1)
    {
        std::vector<long> tmpI (k);
        std::vector<float> tmpD (k);

#pragma omp for
        for (size_t i = 0; i < n; i++) {
            long *lI0 = I0 + i * k;
            float *lD0 = D0 + i * k;
            const long *lI1 = I1 + i * k;
            const float *lD1 = D1 + i * k;
            size_t r0 = 0;
            size_t r1 = 0;

            if (keep_min) {
                for (size_t j = 0; j < k; j++) {

                    if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) {
                        tmpD[j] = lD0[r0];
                        tmpI[j] = lI0[r0];
                        r0++;
                    } else if (lD1[r1] >= 0) {
                        tmpD[j] = lD1[r1];
                        tmpI[j] = lI1[r1] + translation;
                        r1++;
                    } else { // both are NaNs
                        tmpD[j] = NAN;
                        tmpI[j] = -1;
                    }
                }
            } else {
                for (size_t j = 0; j < k; j++) {
                    if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) {
                        tmpD[j] = lD0[r0];
                        tmpI[j] = lI0[r0];
                        r0++;
                    } else if (lD1[r1] >= 0) {
                        tmpD[j] = lD1[r1];
                        tmpI[j] = lI1[r1] + translation;
                        r1++;
                    } else { // both are NaNs
                        tmpD[j] = NAN;
                        tmpI[j] = -1;
                    }
                }
            }
            n1 += r1;
            memcpy (lD0, tmpD.data(), sizeof (lD0[0]) * k);
            memcpy (lI0, tmpI.data(), sizeof (lI0[0]) * k);
        }
    }

    return n1;
}



size_t ranklist_intersection_size (size_t k1, const long *v1,
                                   size_t k2, const long *v2_in)
{
    if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1);
    long *v2 = new long [k2];
    memcpy (v2, v2_in, sizeof (long) * k2);
    std::sort (v2, v2 + k2);
    { // de-dup v2
        long prev = -1;
        size_t wp = 0;
        for (size_t i = 0; i < k2; i++) {
            if (v2[i] != prev) {
                v2[wp++] = prev = v2[i];
            }
        }
        k2 = wp;
    }
    const long seen_flag = 1L << 60;
    size_t count = 0;
    for (size_t i = 0; i < k1; i++) {
        long q = v1[i];
        size_t i0 = 0, i1 = k2;
        while (i0 + 1 < i1) {
            size_t imed = (i1 + i0) / 2;
            long piv = v2[imed] & ~seen_flag;
            if (piv <= q) i0 = imed;
            else i1 = imed;
        }
        if (v2[i0] == q) {
            count++;
            v2[i0] |= seen_flag;
        }
    }
    delete [] v2;

    return count;
}

double imbalance_factor (int k, const int *hist) {
    double tot = 0, uf = 0;

    for (int i = 0; i < k; i++) {
        tot += hist[i];
        uf += hist[i] * (double) hist[i];
    }
    uf = uf * k / (tot * tot);

    return uf;
}
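
// Illustrative worked example (not part of the original file): the
// imbalance factor is sum(hist[i]^2) * k / (sum(hist[i]))^2. A perfectly
// balanced histogram gives 1; putting everything into one of k clusters
// gives k. E.g. for k = 2, hist = {100, 0}:
//   uf = (100^2 + 0^2) * 2 / (100 + 0)^2 = 2
// while hist = {50, 50} gives (2500 + 2500) * 2 / 10000 = 1.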


double imbalance_factor (int n, int k, const long *assign) {
    std::vector<int> hist(k, 0);
    for (int i = 0; i < n; i++) {
        hist[assign[i]]++;
    }

    return imbalance_factor (k, hist.data());
}



int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
    memset (hist, 0, sizeof(hist[0]) * vmax);
    int nout = 0;
    while (n--) {
        if (v[n] < 0 || v[n] >= vmax) nout++;
        else hist[v[n]]++;
    }
    return nout;
}


void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
{
    FAISS_THROW_IF_NOT (nbits % 8 == 0);
    size_t d = nbits / 8;
    std::vector<int> accu(d * 256);
    const uint8_t *c = codes;
    for (size_t i = 0; i < n; i++)
        for (int j = 0; j < d; j++)
            accu[j * 256 + *c++]++;
    memset (hist, 0, sizeof(*hist) * nbits);
    for (int i = 0; i < d; i++) {
        const int *ai = accu.data() + i * 256;
        int * hi = hist + i * 8;
        for (int j = 0; j < 256; j++)
            for (int k = 0; k < 8; k++)
                if ((j >> k) & 1)
                    hi[k] += ai[j];
    }

}



size_t ivec_checksum (size_t n, const int *a)
{
    size_t cs = 112909;
    while (n--) cs = cs * 65713 + a[n] * 1686049;
    return cs;
}


namespace {
    struct ArgsortComparator {
        const float *vals;
        bool operator() (const size_t a, const size_t b) const {
            return vals[a] < vals[b];
        }
    };

    struct SegmentS {
        size_t i0; // begin pointer in the permutation array
        size_t i1; // end
        size_t len() const {
            return i1 - i0;
        }
    };

    // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
    // extended to > 1 merge thread

    // merges 2 ranges that should be consecutive on the source into
    // the union of the two on the destination
    template<typename T>
    void parallel_merge (const T *src, T *dst,
                         SegmentS &s1, SegmentS &s2, int nt,
                         const ArgsortComparator &comp) {
        if (s2.len() > s1.len()) { // make sure that s1 is larger than s2
            std::swap(s1, s2);
        }

        // compute sub-ranges for each thread
        SegmentS s1s[nt], s2s[nt], sws[nt];
        s2s[0].i0 = s2.i0;
        s2s[nt - 1].i1 = s2.i1;

        // not sure parallel actually helps here
#pragma omp parallel for num_threads(nt)
        for (int t = 0; t < nt; t++) {
            s1s[t].i0 = s1.i0 + s1.len() * t / nt;
            s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;

            if (t + 1 < nt) {
                T pivot = src[s1s[t].i1];
                size_t i0 = s2.i0, i1 = s2.i1;
                while (i0 + 1 < i1) {
                    size_t imed = (i1 + i0) / 2;
                    if (comp (pivot, src[imed])) { i1 = imed; }
                    else { i0 = imed; }
                }
                s2s[t].i1 = s2s[t + 1].i0 = i1;
            }
        }
        s1.i0 = std::min(s1.i0, s2.i0);
        s1.i1 = std::max(s1.i1, s2.i1);
        s2 = s1;
        sws[0].i0 = s1.i0;
        for (int t = 0; t < nt; t++) {
            sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
            if (t + 1 < nt) {
                sws[t + 1].i0 = sws[t].i1;
            }
        }
        assert(sws[nt - 1].i1 == s1.i1);

        // do the actual merging
#pragma omp parallel for num_threads(nt)
        for (int t = 0; t < nt; t++) {
            SegmentS sw = sws[t];
            SegmentS s1t = s1s[t];
            SegmentS s2t = s2s[t];
            if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
                for (;;) {
                    // assert (sw.len() == s1t.len() + s2t.len());
                    if (comp(src[s1t.i0], src[s2t.i0])) {
                        dst[sw.i0++] = src[s1t.i0++];
                        if (s1t.i0 == s1t.i1) break;
                    } else {
                        dst[sw.i0++] = src[s2t.i0++];
                        if (s2t.i0 == s2t.i1) break;
                    }
                }
            }
            if (s1t.len() > 0) {
                assert(s1t.len() == sw.len());
                memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
            } else if (s2t.len() > 0) {
                assert(s2t.len() == sw.len());
                memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
            }
        }
    }

} // namespace

void fvec_argsort (size_t n, const float *vals,
                   size_t *perm)
{
    for (size_t i = 0; i < n; i++) perm[i] = i;
    ArgsortComparator comp = {vals};
    std::sort (perm, perm + n, comp);
}
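
// Illustrative sketch (not part of the original file): fvec_argsort returns
// the permutation that sorts vals ascending, so vals[perm[0]] is the
// smallest value.
static inline void example_argsort (const float *vals, size_t n)
{
    std::vector<size_t> perm (n);
    fvec_argsort (n, vals, perm.data());
    assert (n < 2 || vals[perm[0]] <= vals[perm[1]]);
}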

void fvec_argsort_parallel (size_t n, const float *vals,
                            size_t *perm)
{
    size_t *perm2 = new size_t[n];
    // 2 result tables; during merging, flip between them
    size_t *permB = perm2, *permA = perm;

    int nt = omp_get_max_threads();
    { // prepare correct permutation so that the result ends in perm
      // at final iteration
        int nseg = nt;
        while (nseg > 1) {
            nseg = (nseg + 1) / 2;
            std::swap (permA, permB);
        }
    }

#pragma omp parallel for
    for (size_t i = 0; i < n; i++) permA[i] = i;

    ArgsortComparator comp = {vals};

    SegmentS segs[nt];

    // independent sorts
#pragma omp parallel for
    for (int t = 0; t < nt; t++) {
        size_t i0 = t * n / nt;
        size_t i1 = (t + 1) * n / nt;
        SegmentS seg = {i0, i1};
        std::sort (permA + seg.i0, permA + seg.i1, comp);
        segs[t] = seg;
    }
    int prev_nested = omp_get_nested();
    omp_set_nested(1);

    int nseg = nt;
    while (nseg > 1) {
        int nseg1 = (nseg + 1) / 2;
        int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
        int sub_nseg1 = nseg / 2;

#pragma omp parallel for num_threads(nseg1)
        for (int s = 0; s < nseg; s += 2) {
            if (s + 1 == nseg) { // otherwise isolated segment
                memcpy(permB + segs[s].i0, permA + segs[s].i0,
                       segs[s].len() * sizeof(size_t));
            } else {
                int t0 = s * sub_nt / sub_nseg1;
                int t1 = (s + 1) * sub_nt / sub_nseg1;
                printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
                parallel_merge(permA, permB, segs[s], segs[s + 1],
                               t1 - t0, comp);
            }
        }
        for (int s = 0; s < nseg; s += 2)
            segs[s / 2] = segs[s];
        nseg = nseg1;
        std::swap (permA, permB);
    }
    assert (permA == perm);
    omp_set_nested(prev_nested);
    delete [] perm2;
}



const float *fvecs_maybe_subsample (
          size_t d, size_t *n, size_t nmax, const float *x,
          bool verbose, long seed)
{

    if (*n <= nmax) return x; // nothing to do

    size_t n2 = nmax;
    if (verbose) {
        printf ("  Input training set too big (max size is %ld), sampling "
                "%ld / %ld vectors\n", nmax, n2, *n);
    }
    std::vector<int> subset (*n);
    rand_perm (subset.data (), *n, seed);
    float *x_subset = new float[n2 * d];
    for (long i = 0; i < n2; i++)
        memcpy (&x_subset[i * d],
                &x[subset[i] * size_t(d)],
                sizeof (x[0]) * d);
    *n = n2;
    return x_subset;
}


void binary_to_real(size_t d, const uint8_t *x_in, float *x_out) {
    for (size_t i = 0; i < d; ++i) {
        x_out[i] = 2 * ((x_in[i >> 3] >> (i & 7)) & 1) - 1;
    }
}

void real_to_binary(size_t d, const float *x_in, uint8_t *x_out) {
    for (size_t i = 0; i < d / 8; ++i) {
        uint8_t b = 0;
        for (int j = 0; j < 8; ++j) {
            if (x_in[8 * i + j] > 0) {
                b |= (1 << j);
            }
        }
        x_out[i] = b;
    }
}
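
// Illustrative sketch (not part of the original file): real_to_binary
// thresholds at 0 and binary_to_real maps bits to +/-1, so converting a
// sign vector to binary and back is the identity on {-1, +1} vectors.
static inline void example_binary_real_roundtrip ()
{
    const size_t d = 64;   // must be a multiple of 8
    float x[d], x2[d];
    uint8_t code[d / 8];
    float_randn (x, d, 123);
    for (size_t i = 0; i < d; i++) x[i] = x[i] > 0 ? 1.f : -1.f;
    real_to_binary (d, x, code);
    binary_to_real (d, code, x2);
    for (size_t i = 0; i < d; i++) assert (x[i] == x2[i]);
}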


} // namespace faiss